diff --git a/.copyright.hook b/.copyright.hook new file mode 100644 index 0000000000000..7cb4721940fd4 --- /dev/null +++ b/.copyright.hook @@ -0,0 +1,134 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import io +import re +import sys +import os +import datetime + +COPYRIGHT = '''Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.''' + +def _generate_copyright(comment_mark): + copyright=COPYRIGHT.split(os.linesep) + header = copyright[0].rstrip() + + p = re.search('(\d{4})', header).group(0) + now = datetime.datetime.now() + + header = header.replace(p,str(now.year)) + + ans=[comment_mark + " " + header + os.linesep] + for idx, line in enumerate(copyright[1:]): + ans.append(comment_mark + " " + line.rstrip() + os.linesep) + + return ans + +def _get_comment_mark(path): + lang_type=re.compile(r"\.(py|sh)$") + if lang_type.search(path) is not None: + return "#" + + lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$") + if lang_type.search(path) is not None: + return "//" + + return None + + +RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE) +RE_COPYRIGHT = re.compile(r".*Copyright( \(c\))* \d{4}", re.IGNORECASE) +RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!") + +def _check_copyright(path): + head=[] + try: + with open(path) as f: + head = [next(f) for x in range(4)] + except StopIteration: + pass + + for idx, line in enumerate(head): + if RE_COPYRIGHT.search(line) is not None: + return True + + return False + +def generate_copyright(path, comment_mark): + original_contents = io.open(path, encoding="utf-8").readlines() + head = original_contents[0:4] + + insert_line_no=0 + for i, line in enumerate(head): + if RE_ENCODE.search(line) or RE_SHEBANG.search(line): + insert_line_no=i+1 + + copyright = _generate_copyright(comment_mark) + if insert_line_no == 0: + new_contents = copyright + if len(original_contents) > 0 and len(original_contents[0].strip()) != 0: + new_contents.append(os.linesep) + new_contents.extend(original_contents) + else: + new_contents=original_contents[0:insert_line_no] + new_contents.append(os.linesep) + new_contents.extend(copyright) + if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0: + new_contents.append(os.linesep) + 
new_contents.extend(original_contents[insert_line_no:]) + new_contents="".join(new_contents) + + with io.open(path, 'w') as output_file: + output_file.write(new_contents) + + + +def main(argv=None): + parser = argparse.ArgumentParser( + description='Checker for copyright declaration.') + parser.add_argument('filenames', nargs='*', help='Filenames to check') + args = parser.parse_args(argv) + + retv = 0 + for path in args.filenames: + comment_mark = _get_comment_mark(path) + if comment_mark is None: + print("warning:Unsupported file", path, file=sys.stderr) + continue + + if _check_copyright(path): + continue + + generate_copyright(path, comment_mark) + + +if __name__ == '__main__': + exit(main()) \ No newline at end of file diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000000..0833e9852c8f2 --- /dev/null +++ b/.flake8 @@ -0,0 +1,7 @@ +[flake8] +ignore = E203, E402, E501, E731, E741, W503, W605, E722 +max-line-length = 119 + +# E402: module level import not at top of file +per-file-ignores = + __init__.py:F401,F403,E402 \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b21ccc980216..578b2b16f65ec 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,44 +1,45 @@ -- repo: https://github.com/PaddlePaddle/mirrors-yapf.git - sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 +repos: +# For Python files +- repo: https://github.com/psf/black.git + rev: 22.8.0 hooks: - - id: yapf - files: \.py$ + - id: black + files: \.(py|pyi)$ + additional_dependencies: [toml] +- repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort +- repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 - repo: https://github.com/pre-commit/pre-commit-hooks - sha: a11d9314b22d8f8c7556443875b731ef05965464 + rev: v4.1.0 hooks: - id: check-merge-conflict - id: check-symlinks - id: detect-private-key files: (?!.*paddle)^.*$ - id: end-of-file-fixer - files: \.(md|yml)$ + files: \.md$ - id: trailing-whitespace - files: \.(md|yml)$ + files: \.md$ - repo: https://github.com/Lucas-C/pre-commit-hooks - sha: v1.0.1 + rev: v1.1.14 hooks: - id: forbid-crlf - files: \.(md|yml)$ + files: \.md$ - id: remove-crlf - files: \.(md|yml)$ + files: \.md$ - id: forbid-tabs - files: \.(md|yml)$ + files: \.md$ - id: remove-tabs - files: \.(md|yml)$ -- repo: local - hooks: - - id: clang-format-with-version-check - name: clang-format - description: Format files with ClangFormat. - entry: bash ./.travis/codestyle/clang_format.hook -i - language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - + files: \.md$ - repo: local hooks: - - id: cpplint-cpp-source - name: cpplint - description: Check C++ code style using cpplint.py. 
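# A minimal sketch of exercising the new .copyright.hook by hand. pre-commit normally
# passes the staged filenames (see the copyright_checker entry added to
# .pre-commit-config.yaml below); the target file here is only an example.
import subprocess

subprocess.run(
    ["python", ".copyright.hook", "paddlemix/activations.py"],
    check=False,  # the hook inserts a license header into files that lack one and exits 0
)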
- entry: bash ./.travis/codestyle/cpplint_pre_commit.hook + - id: copyright_checker + name: copyright_checker + entry: python .copyright.hook language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ \ No newline at end of file + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$ \ No newline at end of file diff --git a/applications/Automatic_label/automatic_label.py b/applications/Automatic_label/automatic_label.py index 69947e3f2f7d9..8aa5277232de9 100644 --- a/applications/Automatic_label/automatic_label.py +++ b/applications/Automatic_label/automatic_label.py @@ -23,13 +23,16 @@ import requests from paddlenlp.trainer import PdArgumentParser from paddlenlp.transformers import AutoTokenizer -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel from paddlemix.processors.blip_processing import ( - Blip2Processor, BlipImageProcessor, BlipTextProcessor) + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor from paddlemix.processors.sam_processing import SamProcessor from paddlemix.utils.log import logger @@ -48,9 +51,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) ax.text(x0, y0, label) @@ -79,28 +80,36 @@ class ModelArguments: blip2_model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="automatic_label", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def generate_caption(raw_image, prompt, processor, blip2_model): @@ -110,10 +119,10 @@ def generate_caption(raw_image, prompt, processor, blip2_model): text=prompt, return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) generated_ids, scores = blip2_model.generate(**inputs) - generated_text = processor.batch_decode( - generated_ids, skip_special_tokens=True)[0].strip() + 
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() logger.info("Generate text: {}".format(generated_text)) return generated_text @@ -123,10 +132,7 @@ def generate_tags(caption): lemma = nltk.wordnet.WordNetLemmatizer() nltk.download(["punkt", "averaged_perceptron_tagger", "wordnet"]) - tags_list = [ - word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) - if pos[0] == "N" - ] + tags_list = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) if pos[0] == "N"] tags_lemma = [lemma.lemmatize(w) for w in tags_list] tags = ", ".join(map(str, tags_lemma)) @@ -140,19 +146,17 @@ def main(): logger.info("blip2_model: {}".format(model_args.blip2_model_name_or_path)) # bulid blip2 processor - blip2_tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + blip2_tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) blip2_image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.blip2_model_name_or_path, "processor", "eval")) + os.path.join(model_args.blip2_model_name_or_path, "processor", "eval") + ) blip2_text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.blip2_model_name_or_path, "processor", "eval")) - blip2_processor = Blip2Processor(blip2_image_processor, - blip2_text_processor_class, - blip2_tokenizer_class) + os.path.join(model_args.blip2_model_name_or_path, "processor", "eval") + ) + blip2_processor = Blip2Processor(blip2_image_processor, blip2_text_processor_class, blip2_tokenizer_class) # #bulid blip2 model - blip2_model = Blip2ForConditionalGeneration.from_pretrained( - model_args.blip2_model_name_or_path) + blip2_model = Blip2ForConditionalGeneration.from_pretrained(model_args.blip2_model_name_or_path) paddle.device.cuda.empty_cache() blip2_model.eval() @@ -160,21 +164,17 @@ def main(): logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() logger.info("dino_model build finish!") # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") logger.info("SamModel build finish!") # read image @@ -188,7 +188,8 @@ def main(): image_pil, prompt=data_args.prompt, processor=blip2_processor, - blip2_model=blip2_model, ) + blip2_model=blip2_model, + ) det_prompt = generate_tags(caption) logger.info("det prompt: {}".format(det_prompt)) @@ -196,8 +197,7 @@ def main(): image_pil = image_pil.convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=det_prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=det_prompt) with paddle.no_grad(): outputs = dino_model( @@ 
-205,9 +205,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -243,8 +243,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -267,7 +266,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) logger.info("finish!") diff --git a/applications/CVinW/grounded_sam.py b/applications/CVinW/grounded_sam.py index 86a326e8947ab..35d251f59ef6f 100644 --- a/applications/CVinW/grounded_sam.py +++ b/applications/CVinW/grounded_sam.py @@ -13,9 +13,7 @@ # limitations under the License. import os -import sys from dataclasses import dataclass, field -from typing import List import matplotlib.pyplot as plt import numpy as np @@ -23,7 +21,7 @@ import paddle.nn.functional as F import requests from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel @@ -45,9 +43,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) ax.text(x0, y0, label) @@ -61,8 +57,7 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - prompt: str = field( - default=None, metadata={"help": "The prompt of the image to be det."}) + prompt: str = field(default=None, metadata={"help": "The prompt of the image to be det."}) @dataclass @@ -73,22 +68,28 @@ class ModelArguments: dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="grounded_sam_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -96,32 +97,26 @@ def main(): model_args, 
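# A condensed, hedged sketch of the detect-then-segment flow implemented in
# automatic_label.py above (the grounded_sam scripts in this diff follow the same
# pattern). Weight download, thresholding and visualization are omitted; the input
# file and prompt below are only illustrative.
import paddle
from PIL import Image

from paddlemix.models.groundingdino.modeling import GroundingDinoModel
from paddlemix.models.sam.modeling import SamModel
from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor
from paddlemix.processors.sam_processing import SamProcessor

image_pil = Image.open("example.jpg").convert("RGB")
det_prompt = "dog, frisbee"

# 1) GroundingDino turns the text prompt into candidate boxes
dino_processor = GroudingDinoProcessor.from_pretrained("GroundingDino/groundingdino-swint-ogc")
dino_model = GroundingDinoModel.from_pretrained("GroundingDino/groundingdino-swint-ogc")
dino_model.eval()
image_tensor, mask, tok = dino_processor(images=image_pil, text=det_prompt)
with paddle.no_grad():
    outputs = dino_model(
        image_tensor,
        mask,
        input_ids=tok["input_ids"],
        attention_mask=tok["attention_mask"],
        text_self_attention_masks=tok["text_self_attention_masks"],
        position_ids=tok["position_ids"],
    )

# 2) boxes above the box/text thresholds are converted to absolute xyxy coordinates
#    (see the scripts for that step) and handed to SAM as box prompts
sam_processor = SamProcessor.from_pretrained("Sam/SamVitH-1024")
sam_model = SamModel.from_pretrained("Sam/SamVitH-1024", input_type="boxs")
# image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None)
# seg_masks = sam_processor.postprocess_masks(sam_model(img=image_seg, prompt=prompt))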
data_args = parser.parse_args_into_dataclasses() url = data_args.input_image # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") # read image if os.path.isfile(url): # read image image_pil = Image.open(url).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=data_args.prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=data_args.prompt) with paddle.no_grad(): outputs = dino_model( @@ -129,9 +124,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -167,8 +162,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -190,7 +184,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) logger.info("finish!") diff --git a/applications/Inpainting/grounded_sam_chatglm.py b/applications/Inpainting/grounded_sam_chatglm.py index a983379e2450b..dd99dc6bc8007 100644 --- a/applications/Inpainting/grounded_sam_chatglm.py +++ b/applications/Inpainting/grounded_sam_chatglm.py @@ -22,7 +22,7 @@ import requests from paddlenlp import Taskflow from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel @@ -45,9 +45,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 
0), lw=2)) ax.text(x0, y0, label) @@ -60,11 +58,14 @@ class DataArguments: the command line. """ - input_image: str = field(metadata={"help": "The name of input image."}, ) + input_image: str = field( + metadata={"help": "The name of input image."}, + ) prompt: str = field( default=None, - metadata={"help": "The prompt of the image to be inpaint."}, ) + metadata={"help": "The prompt of the image to be inpaint."}, + ) @dataclass @@ -75,36 +76,45 @@ class ModelArguments: stable_diffusion_pipeline_name_or_path: str = field( default="stabilityai/stable-diffusion-2-inpainting", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) chatglm_model_name_or_path: str = field( default="THUDM/chatglm-6b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="inpainting_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def filter_prompts_with_chatglm(caption, model_name_or_path="THUDM/chatglm-6b"): prompt = ( "Given caption,extract the main object to be replaced and marked it as 'main_object', " - + f"Extract the remaining part as 'other prompt', " + - f"Return main_object, other prompt in English" + - f"Given caption: {caption}.") + + "Extract the remaining part as 'other prompt', " + + "Return main_object, other prompt in English" + + "Given caption: {}.".format(caption) + ) logger.info("chatglm: {}".format(model_name_or_path)) textGen = Taskflow("text2text_generation", model=model_name_or_path) @@ -113,7 +123,8 @@ def filter_prompts_with_chatglm(caption, model_name_or_path="THUDM/chatglm-6b"): det_prompt, inpaint_prompt = ( reply.split("\n")[0].split(":")[-1].strip(), - reply.split("\n")[-1].split(":")[-1].strip(), ) + reply.split("\n")[-1].split(":")[-1].strip(), + ) return det_prompt, inpaint_prompt @@ -125,21 +136,17 @@ def main(): logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() logger.info("dino_model build finish!") # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = 
SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") logger.info("SamModel build finish!") # read image @@ -149,16 +156,14 @@ def main(): else: image_pil = Image.open(requests.get(url, stream=True).raw) - det_prompt, inpaint_prompt = filter_prompts_with_chatglm( - data_args.prompt, model_args.chatglm_model_name_or_path) + det_prompt, inpaint_prompt = filter_prompts_with_chatglm(data_args.prompt, model_args.chatglm_model_name_or_path) logger.info("det prompt: {}".format(det_prompt)) logger.info("inpaint prompt: {}".format(inpaint_prompt)) image_pil = image_pil.convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=det_prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=det_prompt) with paddle.no_grad(): outputs = dino_model( @@ -166,9 +171,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -204,8 +209,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -227,12 +231,11 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) - logger.info("stable diffusion pipeline: {}".format( - model_args.stable_diffusion_pipeline_name_or_path)) - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_args.stable_diffusion_pipeline_name_or_path) + logger.info("stable diffusion pipeline: {}".format(model_args.stable_diffusion_pipeline_name_or_path)) + pipe = StableDiffusionInpaintPipeline.from_pretrained(model_args.stable_diffusion_pipeline_name_or_path) logger.info("stable diffusion pipeline build finish!") merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) @@ -242,11 +245,9 @@ def main(): image_pil = image_pil.resize((512, 512)) mask_pil = mask_pil.resize((512, 512)) - image = pipe( - prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0] + image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0] image = image.resize(size) - image.save( - os.path.join(model_args.output_dir, "grounded_sam_chatglm_output.jpg")) + image.save(os.path.join(model_args.output_dir, "grounded_sam_chatglm_output.jpg")) logger.info("finish!") diff --git a/applications/Inpainting/grounded_sam_inpainting.py b/applications/Inpainting/grounded_sam_inpainting.py index 1fa8aacc4a39c..eccc41359072f 100644 --- a/applications/Inpainting/grounded_sam_inpainting.py +++ b/applications/Inpainting/grounded_sam_inpainting.py @@ -21,7 +21,7 @@ import 
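# A hedged sketch of the inpainting tail of grounded_sam_chatglm.py above
# (grounded_sam_inpainting.py in this diff follows the same pattern): SAM masks are
# merged into one binary mask, image and mask are resized to 512x512 for the Stable
# Diffusion inpainting pipeline, and the result is resized back. The pipeline import
# path and the boolean-mask-to-PIL conversion are assumptions; the rest mirrors the diff.
import paddle
from PIL import Image
from ppdiffusers import StableDiffusionInpaintPipeline  # import path assumed


def inpaint_with_masks(image_pil, seg_masks, inpaint_prompt,
                       pipeline_name="stabilityai/stable-diffusion-2-inpainting"):
    merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) > 0
    mask_pil = Image.fromarray(merge_mask.cast("uint8").numpy()[0][0] * 255)  # assumed conversion
    pipe = StableDiffusionInpaintPipeline.from_pretrained(pipeline_name)
    size = image_pil.size
    out = pipe(
        prompt=inpaint_prompt,
        image=image_pil.resize((512, 512)),
        mask_image=mask_pil.resize((512, 512)),
    ).images[0]
    return out.resize(size)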
paddle.nn.functional as F import requests from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel @@ -44,9 +44,7 @@ def show_mask(mask, ax, random_color=False): def show_box(box, ax, label): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] - ax.add_patch( - plt.Rectangle( - (x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)) ax.text(x0, y0, label) @@ -59,15 +57,19 @@ class DataArguments: the command line. """ - input_image: str = field(metadata={"help": "The name of input image."}, ) + input_image: str = field( + metadata={"help": "The name of input image."}, + ) det_prompt: str = field( default=None, - metadata={"help": "The prompt of the image to be det."}, ) + metadata={"help": "The prompt of the image to be det."}, + ) inpaint_prompt: str = field( default=None, - metadata={"help": "The prompt of the image to be inpaint."}, ) + metadata={"help": "The prompt of the image to be inpaint."}, + ) @dataclass @@ -78,25 +80,32 @@ class ModelArguments: stable_diffusion_pipeline_name_or_path: str = field( default="stabilityai/stable-diffusion-2-inpainting", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) dino_model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) sam_model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="inpainting_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -104,29 +113,23 @@ def main(): model_args, data_args = parser.parse_args_into_dataclasses() url = data_args.input_image - logger.info("stable diffusion pipeline: {}".format( - model_args.stable_diffusion_pipeline_name_or_path)) - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_args.stable_diffusion_pipeline_name_or_path) + logger.info("stable diffusion pipeline: {}".format(model_args.stable_diffusion_pipeline_name_or_path)) + pipe = StableDiffusionInpaintPipeline.from_pretrained(model_args.stable_diffusion_pipeline_name_or_path) logger.info("stable diffusion pipeline build finish!") logger.info("dino_model: {}".format(model_args.dino_model_name_or_path)) # bulid dino processor - dino_processor = GroudingDinoProcessor.from_pretrained( - model_args.dino_model_name_or_path) + dino_processor = GroudingDinoProcessor.from_pretrained(model_args.dino_model_name_or_path) # bulid dino model - dino_model = GroundingDinoModel.from_pretrained( - model_args.dino_model_name_or_path) + dino_model = 
GroundingDinoModel.from_pretrained(model_args.dino_model_name_or_path) dino_model.eval() logger.info("dino_model build finish!") # buidl sam processor - sam_processor = SamProcessor.from_pretrained( - model_args.sam_model_name_or_path) + sam_processor = SamProcessor.from_pretrained(model_args.sam_model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.sam_model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.sam_model_name_or_path, input_type="boxs") + sam_model = SamModel.from_pretrained(model_args.sam_model_name_or_path, input_type="boxs") logger.info("SamModel build finish!") # read image @@ -142,8 +145,7 @@ def main(): image_pil = image_pil.convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = dino_processor( - images=image_pil, text=data_args.det_prompt) + image_tensor, mask, tokenized_out = dino_processor(images=image_pil, text=data_args.det_prompt) with paddle.no_grad(): outputs = dino_model( @@ -151,9 +153,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) @@ -189,8 +191,7 @@ def main(): x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1) boxes.append([x0, y0, x1, y1]) boxes = np.array(boxes) - image_seg, prompt = sam_processor( - image_pil, input_type="boxs", box=boxes, point_coords=None) + image_seg, prompt = sam_processor(image_pil, input_type="boxs", box=boxes, point_coords=None) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = sam_processor.postprocess_masks(seg_masks) @@ -212,7 +213,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) merge_mask = merge_mask > 0 @@ -221,13 +223,9 @@ def main(): image_pil = image_pil.resize((512, 512)) mask_pil = mask_pil.resize((512, 512)) - image = pipe( - prompt=data_args.inpaint_prompt, image=image_pil, - mask_image=mask_pil).images[0] + image = pipe(prompt=data_args.inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0] image = image.resize(size) - image.save( - os.path.join(model_args.output_dir, - "grounded_sam_inpainting_output.jpg")) + image.save(os.path.join(model_args.output_dir, "grounded_sam_inpainting_output.jpg")) logger.info("finish!") diff --git a/deploy/groundingdino/export.py b/deploy/groundingdino/export.py index 1e617cd852353..86de4d5085a5c 100644 --- a/deploy/groundingdino/export.py +++ b/deploy/groundingdino/export.py @@ -31,12 +31,12 @@ def _prune_input_spec(input_spec, program, targets): pruned_input_spec = [{}] program = program.clone() program = program._prune(targets=targets) - global_block = program.global_block() + # global_block = program.global_block() for spec in input_spec: try: name = spec.name - v = global_block.var(name) + # v = global_block.var(name) pruned_input_spec[0][name] = spec except Exception: pass @@ -47,20 +47,12 @@ def _prune_input_spec(input_spec, program, targets): def apply_to_static(model): input_spec = [ - InputSpec( - shape=[None, 3, None, None], name="x", dtype="float32"), - InputSpec( - shape=[None, None, None], name="m", dtype="int64"), - InputSpec( - 
shape=[None, None], name="input_ids", dtype="int64"), - InputSpec( - shape=[None, None], name="attention_mask", dtype="int64"), - InputSpec( - shape=[None, None, None], - name="text_self_attention_masks", - dtype="int64"), - InputSpec( - shape=[None, None], name="position_ids", dtype="int64"), + InputSpec(shape=[None, 3, None, None], name="x", dtype="float32"), + InputSpec(shape=[None, None, None], name="m", dtype="int64"), + InputSpec(shape=[None, None], name="input_ids", dtype="int64"), + InputSpec(shape=[None, None], name="attention_mask", dtype="int64"), + InputSpec(shape=[None, None, None], name="text_self_attention_masks", dtype="int64"), + InputSpec(shape=[None, None], name="position_ids", dtype="int64"), ] model = paddle.jit.to_static(model, input_spec=input_spec) return model, input_spec @@ -74,13 +66,15 @@ def apply_to_static(model): "-dt", type=str, default="GroundingDino/groundingdino-swint-ogc", - help="dino type", ) + help="dino type", + ) parser.add_argument( "--output_dir", "-o", type=str, default="output_groundingdino", - help="output directory", ) + help="output directory", + ) args = parser.parse_args() output_dir = args.output_dir @@ -93,4 +87,5 @@ def apply_to_static(model): paddle.jit.save( static_model, os.path.join(output_dir, "groundingdino_model"), - input_spec=input_spec, ) + input_spec=input_spec, + ) diff --git a/deploy/groundingdino/predict.py b/deploy/groundingdino/predict.py index 836529faab8ed..60c13c20fcf5e 100644 --- a/deploy/groundingdino/predict.py +++ b/deploy/groundingdino/predict.py @@ -25,25 +25,26 @@ from PIL import Image, ImageDraw, ImageFont from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor -from paddlemix.utils.log import logger ms_deformable_attn = load( name="deformable_detr_ops", sources=[ "./paddlemix/models/groundingdino/csrc/ms_deformable_attn_op.cc", "./paddlemix/models/groundingdino/csrc/ms_deformable_attn_op.cu", - ], ) + ], +) def load_predictor( - model_dir, - run_mode="paddle", - batch_size=1, - device="GPU", - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - delete_shuffle_pass=False, ): + model_dir, + run_mode="paddle", + batch_size=1, + device="GPU", + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + delete_shuffle_pass=False, +): """set AnalysisConfig, generate AnalysisPredictor Args: model_dir (str): root path of __model__ and __params__ @@ -64,8 +65,8 @@ def load_predictor( """ if device != "GPU" and run_mode != "paddle": raise ValueError( - "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}". - format(run_mode, device)) + "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}".format(run_mode, device) + ) infer_model = os.path.join(model_dir, "groundingdino_model.pdmodel") infer_params = os.path.join(model_dir, "groundingdino_model.pdiparams") @@ -93,10 +94,8 @@ def load_predictor( config.enable_mkldnn() if enable_mkldnn_bfloat16: config.enable_mkldnn_bfloat16() - except Exception as e: - print( - "The current environment does not support `mkldnn`, so disable mkldnn." 
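# A short sketch of the export/inference round trip used by deploy/groundingdino
# above: apply_to_static() plus paddle.jit.save write groundingdino_model.pdmodel /
# .pdiparams, which load_predictor() in predict.py then loads through
# paddle.inference. The GPU memory pool size below is illustrative.
import os

from paddle.inference import Config, create_predictor

output_dir = "output_groundingdino"
config = Config(
    os.path.join(output_dir, "groundingdino_model.pdmodel"),
    os.path.join(output_dir, "groundingdino_model.pdiparams"),
)
config.enable_use_gpu(500, 0)  # initial memory pool (MB), GPU id
config.switch_ir_optim(True)
predictor = create_predictor(config)
print(predictor.get_input_names())  # x, m, input_ids, attention_mask, ...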
- ) + except Exception: + print("The current environment does not support `mkldnn`, so disable mkldnn.") pass # disable print log when predict @@ -154,8 +153,7 @@ def plot_boxes_to_image(image_pil, tgt): class Predictor(object): def __init__(self, model_args, data_args): - self.processor = GroudingDinoProcessor.from_pretrained( - model_args.text_encoder_type) + self.processor = GroudingDinoProcessor.from_pretrained(model_args.text_encoder_type) self.box_threshold = model_args.box_threshold self.text_threshold = model_args.text_threshold self.predictor, self.config = load_predictor(model_args.model_path) @@ -171,8 +169,7 @@ def create_inputs(self): self.input_map["m"] = np.array(self.mask.numpy(), dtype="int64") for key in self.tokenized_input.keys(): - self.input_map[key] = np.array( - self.tokenized_input[key].numpy(), dtype="int64") + self.input_map[key] = np.array(self.tokenized_input[key].numpy(), dtype="int64") input_names = self.predictor.get_input_names() for i in range(len(input_names)): @@ -181,8 +178,7 @@ def create_inputs(self): def preprocess(self, image, text): - self.image, self.mask, self.tokenized_input = self.processor( - images=image, text=text) + self.image, self.mask, self.tokenized_input = self.processor(images=image, text=text) def run(self, image, prompt): self.preprocess(image, data_args.prompt) @@ -190,10 +186,8 @@ def run(self, image, prompt): self.create_inputs() self.predictor.run() output_names = self.predictor.get_output_names() - pred_boxes = self.predictor.get_output_handle(output_names[ - 0]).copy_to_cpu() - pred_logits = self.predictor.get_output_handle(output_names[ - 1]).copy_to_cpu() + pred_boxes = self.predictor.get_output_handle(output_names[0]).copy_to_cpu() + pred_logits = self.predictor.get_output_handle(output_names[1]).copy_to_cpu() pred_dict = { "pred_logits": paddle.to_tensor(pred_logits), @@ -219,8 +213,7 @@ def postprocess(self, outputs, with_logits=True): for logit, box in zip(logits_filt, boxes_filt): pred_phrase = self.processor.decode(logit > self.text_threshold) if with_logits: - pred_phrases.append(pred_phrase + - f"({str(logit.max().item())[:4]})") + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") else: pred_phrases.append(pred_phrase) @@ -235,8 +228,7 @@ def main(model_args, data_args): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") boxes_filt, pred_phrases = predictor.run(image_pil, data_args.prompt) @@ -265,9 +257,7 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - prompt: str = field( - default=None, - metadata={"help": "The prompt of the image to be generated."}) + prompt: str = field(default=None, metadata={"help": "The prompt of the image to be generated."}) @dataclass @@ -278,30 +268,32 @@ class ModelArguments: model_path: str = field( default="output_groundingdino/", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_encoder_type: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "type for text encoder ."}, ) + metadata={"help": "type for text encoder ."}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - 
metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) run_mode: str = field( default="paddle", - metadata={ - "help": "mode of running(paddle/trt_fp32/trt_fp16/trt_int8)." - }, ) + metadata={"help": "mode of running(paddle/trt_fp32/trt_fp16/trt_int8)."}, + ) device: str = field( default="GPU", - metadata={ - "help": - "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU." - }, ) + metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."}, + ) if __name__ == "__main__": diff --git a/deploy/sam/export.py b/deploy/sam/export.py index a2ecf4f879a12..e56c1eb7aaa33 100644 --- a/deploy/sam/export.py +++ b/deploy/sam/export.py @@ -14,7 +14,6 @@ import argparse import os -import sys import paddle import yaml @@ -30,24 +29,28 @@ def parse_args(): choices=["SamVitL", "SamVitB", "SamVitH"], required=True, help="The model type.", - type=str, ) + type=str, + ) parser.add_argument( "--input_type", choices=["boxs", "points", "points_grid"], required=True, help="The model type.", - type=str, ) + type=str, + ) parser.add_argument( "--save_dir", help="The directory for saving the exported inference model", type=str, - default="./output/inference_model", ) + default="./output/inference_model", + ) parser.add_argument( "--input_img_shape", nargs="+", help="Export the model with fixed input shape, e.g., `--input_img_shape 1 3 512 1024`.", type=int, - default=[1, 3, 1024, 1024], ) + default=[1, 3, 1024, 1024], + ) return parser.parse_args() @@ -56,11 +59,9 @@ def main(args): os.environ["PADDLESEG_EXPORT_STAGE"] = "True" - model = SamModel.from_pretrained( - args.model_type, input_type=args.input_type) + model = SamModel.from_pretrained(args.model_type, input_type=args.input_type) - shape = ([None, 3, None, None] - if args.input_img_shape is None else args.input_img_shape) + shape = [None, 3, None, None] if args.input_img_shape is None else args.input_img_shape if args.input_type == "points": shape2 = [1, 1, 2] elif args.input_type == "boxs": @@ -69,10 +70,8 @@ def main(args): shape2 = [64, 1, 2] input_spec = [ - paddle.static.InputSpec( - shape=shape, dtype="float32"), - paddle.static.InputSpec( - shape=shape2, dtype="int32"), + paddle.static.InputSpec(shape=shape, dtype="float32"), + paddle.static.InputSpec(shape=shape2, dtype="int32"), ] model.eval() model = paddle.jit.to_static(model, input_spec=input_spec) diff --git a/deploy/sam/predict.py b/deploy/sam/predict.py index cde1d2b01fe68..00d260a840e65 100644 --- a/deploy/sam/predict.py +++ b/deploy/sam/predict.py @@ -19,15 +19,12 @@ import matplotlib.pyplot as plt import numpy as np -import paddle -import paddle.nn.functional as F import requests import yaml from paddle.inference import Config as PredictConfig -from paddle.inference import PrecisionType, create_predictor -from paddle.utils.cpp_extension import load +from paddle.inference import create_predictor from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, ImageFont +from PIL import Image from paddlemix.processors.sam_processing import SamProcessor from paddlemix.utils.log import logger @@ -60,9 +57,13 @@ def params(self): def use_auto_tune(args): - return (hasattr(PredictConfig, "collect_shape_range_info") and - hasattr(PredictConfig, "enable_tuned_tensorrt_dynamic_shape") and - args.device == "gpu" and args.use_trt and args.enable_auto_tune) + return ( + 
hasattr(PredictConfig, "collect_shape_range_info") + and hasattr(PredictConfig, "enable_tuned_tensorrt_dynamic_shape") + and args.device == "gpu" + and args.use_trt + and args.enable_auto_tune + ) def auto_tune(args, imgs, img_nums): @@ -80,8 +81,8 @@ def auto_tune(args, imgs, img_nums): logger.info("Auto tune the dynamic shape for GPU TRT.") assert use_auto_tune(args), ( - "Do not support auto_tune, which requires " - "device==gpu && use_trt==True && paddle >= 2.2") + "Do not support auto_tune, which requires " "device==gpu && use_trt==True && paddle >= 2.2" + ) if not isinstance(imgs, (list, tuple)): imgs = [imgs] @@ -114,8 +115,8 @@ def auto_tune(args, imgs, img_nums): except Exception as e: logger.info(str(e)) logger.info( - "Auto tune failed. Usually, the error is out of GPU memory " - "for the model or image is too large. \n") + "Auto tune failed. Usually, the error is out of GPU memory " "for the model or image is too large. \n" + ) del predictor if os.path.exists(args.auto_tuned_shape_file): os.remove(args.auto_tuned_shape_file) @@ -153,7 +154,8 @@ def __init__(self, args): logger.info( "If the above error is '(InvalidArgument) some trt inputs dynamic shape info not set, " "..., Expected all_dynamic_shape_set == true, ...', " - "please set --enable_auto_tune=True to use auto_tune. \n") + "please set --enable_auto_tune=True to use auto_tune. \n" + ) exit() def _init_base_config(self): @@ -182,12 +184,6 @@ def _init_gpu_config(self): """ logger.info("Use GPU") self.pred_cfg.enable_use_gpu(100, 0) - precision_map = { - "fp16": PrecisionType.Half, - "fp32": PrecisionType.Float32, - "int8": PrecisionType.Int8, - } - precision_mode = precision_map[self.args.precision] def run(self, image, prompt_out): image, prompt_out = self.preprocess(image, prompt_out) @@ -218,7 +214,8 @@ def preprocess(self, image, prompts): image, input_type=self.args.input_type, box=prompts["boxs"], - point_coords=prompts["points"], ) + point_coords=prompts["points"], + ) return [image_seg, prompt] @@ -236,11 +233,8 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - box_prompt: List[int] = field( - default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) - points_prompt: List[int] = field( - default=None, - metadata={"help": "point promt format as [[xy],[xy]...]."}) + box_prompt: List[int] = field(default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) + points_prompt: List[int] = field(default=None, metadata={"help": "point promt format as [[xy],[xy]...]."}) @dataclass @@ -251,53 +245,56 @@ class ModelArguments: model_name_or_path: str = field( default="Sam/SamVitH", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) input_type: str = field( default="boxs", - metadata={ - "help": - "The model prompt type, choices ['boxs', 'points', 'points_grid']." - }, ) + metadata={"help": "The model prompt type, choices ['boxs', 'points', 'points_grid']."}, + ) cfg: str = field( default=None, - metadata={"help": "The config file."}, ) + metadata={"help": "The config file."}, + ) use_trt: bool = field( default=False, - metadata={ - "help": "Whether to use Nvidia TensorRT to accelerate prediction." 
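# A hedged sketch of the two-pass TensorRT dynamic-shape tuning that
# use_auto_tune()/auto_tune() above rely on: pass 1 records shape ranges, pass 2
# feeds them to the TRT engine. Model file names and sizes are illustrative only.
from paddle.inference import Config, PrecisionType, create_predictor

shape_file = "shape_range_info.pbtxt"

# pass 1: run a few representative inputs with shape collection enabled
cfg = Config("model.pdmodel", "model.pdiparams")
cfg.enable_use_gpu(100, 0)
cfg.collect_shape_range_info(shape_file)
predictor = create_predictor(cfg)
# ... feed sample images through `predictor` here ...

# pass 2: build the TRT engine with the recorded dynamic-shape ranges
cfg2 = Config("model.pdmodel", "model.pdiparams")
cfg2.enable_use_gpu(100, 0)
cfg2.enable_tensorrt_engine(
    workspace_size=1 << 30,
    min_subgraph_size=3,
    precision_mode=PrecisionType.Float32,
    use_static=False,
    use_calib_mode=False,
)
cfg2.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
tuned_predictor = create_predictor(cfg2)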
- }, ) + metadata={"help": "Whether to use Nvidia TensorRT to accelerate prediction."}, + ) precision: str = field( default="fp32", - metadata={"help": "The tensorrt precision."}, ) + metadata={"help": "The tensorrt precision."}, + ) min_subgraph_size: int = field( default=3, - metadata={"help": "The min subgraph size in tensorrt prediction.'"}, ) + metadata={"help": "The min subgraph size in tensorrt prediction.'"}, + ) enable_auto_tune: bool = field( default=False, metadata={ - "help": - "Whether to enable tuned dynamic shape. We uses some images to collect \ + "help": "Whether to enable tuned dynamic shape. We uses some images to collect \ the dynamic shape for trt sub graph, which avoids setting dynamic shape manually." - }, ) + }, + ) device: str = field( default="GPU", - metadata={ - "help": - "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU." - }, ) + metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."}, + ) cpu_threads: int = field( default=10, - metadata={"help": "Number of threads to predict when using cpu."}, ) + metadata={"help": "Number of threads to predict when using cpu."}, + ) enable_mkldnn: bool = field( default=False, - metadata={"help": "Enable to use mkldnn to speed up when using cpu."}, ) + metadata={"help": "Enable to use mkldnn to speed up when using cpu."}, + ) output_dir: str = field( default="seg_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(model_args, data_args): @@ -308,8 +305,7 @@ def main(model_args, data_args): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") if data_args.box_prompt is not None: data_args.box_prompt = np.array(data_args.box_prompt) @@ -323,10 +319,7 @@ def main(model_args, data_args): predictor = Predictor(model_args) image_pil = Image.open(data_args.input_image).convert("RGB") - seg_masks = predictor.run(image_pil, { - "points": data_args.points_prompt, - "boxs": data_args.box_prompt - }) + seg_masks = predictor.run(image_pil, {"points": data_args.points_prompt, "boxs": data_args.box_prompt}) if model_args.visual: # make dir @@ -342,10 +335,10 @@ def main(model_args, data_args): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) - if use_auto_tune(model_args) and os.path.exists( - model_args.auto_tuned_shape_file): + if use_auto_tune(model_args) and os.path.exists(model_args.auto_tuned_shape_file): os.remove(model_args.auto_tuned_shape_file) diff --git a/paddlemix/activations.py b/paddlemix/activations.py index c3119b2315377..ab9be11679283 100644 --- a/paddlemix/activations.py +++ b/paddlemix/activations.py @@ -28,9 +28,9 @@ class NewGELUActivation(nn.Layer): """ def forward(self, input: Tensor) -> Tensor: - return (0.5 * input * (1.0 + paddle.tanh( - math.sqrt(2.0 / math.pi) * - (input + 0.044715 * paddle.pow(input, 3.0))))) + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) class GELUActivation(nn.Layer): @@ -41,7 +41,7 @@ class GELUActivation(nn.Layer): Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ - def 
__init__(self, use_gelu_python: bool=False): + def __init__(self, use_gelu_python: bool = False): super().__init__() if use_gelu_python: self.act = self._gelu_python @@ -61,9 +61,7 @@ class FastGELUActivation(nn.Layer): """ def forward(self, input: Tensor) -> Tensor: - return (0.5 * input * - (1.0 + paddle.tanh(input * 0.7978845608 * - (1.0 + 0.044715 * input * input)))) + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) class QuickGELUActivation(nn.Layer): @@ -90,8 +88,7 @@ class ClippedGELUActivation(nn.Layer): def __init__(self, min: float, max: float): if min > max: - raise ValueError( - f"min should be < max (got min: {min}, max: {max})") + raise ValueError(f"min should be < max (got min: {min}, max: {max})") super().__init__() self.min = min @@ -142,15 +139,10 @@ def __getitem__(self, key): ACT2CLS = { "gelu": GELUActivation, - "gelu_10": (ClippedGELUActivation, { - "min": -10, - "max": 10 - }), + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, - "gelu_python": (GELUActivation, { - "use_gelu_python": True - }), + "gelu_python": (GELUActivation, {"use_gelu_python": True}), "linear": LinearActivation, "mish": MishActivation, "quick_gelu": QuickGELUActivation, @@ -168,9 +160,7 @@ def get_activation(activation_string): if activation_string in ACT2FN: return ACT2FN[activation_string] else: - raise KeyError( - f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}" - ) + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") # For backwards compatibility with: from activations import gelu_python diff --git a/paddlemix/appflow/appflow.py b/paddlemix/appflow/appflow.py index 4823fee7482bb..d26e362965302 100644 --- a/paddlemix/appflow/appflow.py +++ b/paddlemix/appflow/appflow.py @@ -32,16 +32,8 @@ class Appflow(object): """ - def __init__(self, - app, - models=None, - mode=None, - device_id=0, - from_hf_hub=False, - **kwargs): - assert ( - app in APPLICATIONS - ), f"The task name:{app} is not in Taskflow list, please check your task name." + def __init__(self, app, models=None, mode=None, device_id=0, from_hf_hub=False, **kwargs): + assert app in APPLICATIONS, f"The task name:{app} is not in Taskflow list, please check your task name." 
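# A small usage sketch for the activation registry reformatted above: get_activation()
# resolves a string name through ACT2FN and returns a ready-to-call layer, raising
# KeyError for unknown names. The tensor values are only illustrative.
import paddle

from paddlemix.activations import get_activation

act = get_activation("gelu_new")            # NewGELUActivation (tanh approximation)
print(act(paddle.to_tensor([-1.0, 0.0, 1.0])))

clipped = get_activation("gelu_10")         # ClippedGELUActivation with min=-10, max=10
print(clipped(paddle.to_tensor([20.0])))    # output is clipped into [-10, 10]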
self.app = app # Set the device for the task device = get_env_device() @@ -55,16 +47,14 @@ def __init__(self, self.models = models if isinstance(self.models, list) and len(self.models) > 0: for model in self.models: - assert model in set(APPLICATIONS[app][tag].keys( - )), f"The {tag} name: {model} is not in task:[{app}]" + assert model in set(APPLICATIONS[app][tag].keys()), f"The {tag} name: {model} is not in task:[{app}]" else: self.models = [APPLICATIONS[app]["default"][ind_tag]] self.task_instances = [] for model in self.models: if "task_priority_path" in APPLICATIONS[self.app][tag][model]: - priority_path = APPLICATIONS[self.app][tag][model][ - "task_priority_path"] + priority_path = APPLICATIONS[self.app][tag][model]["task_priority_path"] else: priority_path = None @@ -79,7 +69,9 @@ def __init__(self, task=self.app, priority_path=priority_path, from_hf_hub=from_hf_hub, - **kwargs, )) + **kwargs, + ) + ) app_list = APPLICATIONS.keys() Appflow.app_list = app_list diff --git a/paddlemix/appflow/apptask.py b/paddlemix/appflow/apptask.py index 7b7bdc6055650..694204c312c22 100644 --- a/paddlemix/appflow/apptask.py +++ b/paddlemix/appflow/apptask.py @@ -42,28 +42,22 @@ def __init__(self, model, task, priority_path=None, **kwargs): self._priority_path = priority_path self.is_static_model = kwargs.get("is_static_model", False) - self._home_path = (self.kwargs["home_path"] - if "home_path" in self.kwargs else PPMIX_HOME) + self._home_path = self.kwargs["home_path"] if "home_path" in self.kwargs else PPMIX_HOME if "task_path" in self.kwargs: self._task_path = self.kwargs["task_path"] self._model_dir = self._task_path elif self._priority_path: - self._task_path = os.path.join(self._home_path, "models", - self._priority_path) + self._task_path = os.path.join(self._home_path, "models", self._priority_path) self._model_dir = os.path.join(self._home_path, "models") else: - self._task_path = os.path.join(self._home_path, "models", - self.model) + self._task_path = os.path.join(self._home_path, "models", self.model) self._model_dir = os.path.join(self._home_path, "models") - self._infer_precision = (self.kwargs["precision"] - if "precision" in self.kwargs else "fp32") + self._infer_precision = self.kwargs["precision"] if "precision" in self.kwargs else "fp32" # Default to use Paddle Inference self._predictor_type = "paddle-inference" - self._num_threads = (self.kwargs["num_threads"] - if "num_threads" in self.kwargs else - math.ceil(cpu_count() / 2)) + self._num_threads = self.kwargs["num_threads"] if "num_threads" in self.kwargs else math.ceil(cpu_count() / 2) def _construct_tokenizer(self, model): """ @@ -83,8 +77,7 @@ def _get_static_model_name(self): if len(names) == 0: raise IOError(f"{self._task_path} should include '.pdparams' file.") if len(names) > 1: - logger.warning( - f"{self._task_path} includes more than one '.pdparams' file.") + logger.warning(f"{self._task_path} includes more than one '.pdparams' file.") return names[0] def _convert_dygraph_to_static(self): @@ -98,12 +91,10 @@ def _convert_dygraph_to_static(self): self._input_spec is not None ), "The input spec must be created before converting the dygraph model to static model." 
logger.info("Converting to the inference model cost a little time.") - static_model = paddle.jit.to_static( - self._model, input_spec=self._input_spec) + static_model = paddle.jit.to_static(self._model, input_spec=self._input_spec) paddle.jit.save(static_model, self.inference_model_path) - logger.info("The inference model save in the path:{}".format( - self.inference_model_path)) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) def _prepare_static_mode(self): """ @@ -139,50 +130,46 @@ def _prepare_static_mode(self): min_subgraph_size=30, precision_mode=precision_map[self._infer_precision], use_static=True, - use_calib_mode=False, ) + use_calib_mode=False, + ) if not os.path.exists(self._tuned_trt_shape_file): - self._config.collect_shape_range_info( - self._tuned_trt_shape_file) + self._config.collect_shape_range_info(self._tuned_trt_shape_file) else: - logger.info(f"Use dynamic shape file: " - f"{self._tuned_trt_shape_file} for TRT...") - self._config.enable_tuned_tensorrt_dynamic_shape( - self._tuned_trt_shape_file, True) + logger.info(f"Use dynamic shape file: " f"{self._tuned_trt_shape_file} for TRT...") + self._config.enable_tuned_tensorrt_dynamic_shape(self._tuned_trt_shape_file, True) if self.task == "openset_det_sam": self._config.delete_pass("add_support_int8_pass") if self.model == "GroundingDino/groundingdino-swint-ogc": - self._config.exp_disable_tensorrt_ops([ - "pad3d", - "set_value", - "reduce_all", - "cumsum_8.tmp_0", - "linear_296.tmp_1", - ]) + self._config.exp_disable_tensorrt_ops( + [ + "pad3d", + "set_value", + "reduce_all", + "cumsum_8.tmp_0", + "linear_296.tmp_1", + ] + ) if self.model == "Sam/SamVitH-1024" or self.model == "Sam/SamVitH-512": self._config.delete_pass("shuffle_channel_detect_pass") self._config.delete_pass("trt_skip_layernorm_fuse_pass") self._config.delete_pass("preln_residual_bias_fuse_pass") - self._config.exp_disable_tensorrt_ops([ - "concat_1.tmp_0", - "set_value", - "empty_0.tmp_0", - "concat_55.tmp_0", - ]) + self._config.exp_disable_tensorrt_ops( + [ + "concat_1.tmp_0", + "set_value", + "empty_0.tmp_0", + "concat_55.tmp_0", + ] + ) self.predictor = paddle.inference.create_predictor(self._config) self.input_names = [name for name in self.predictor.get_input_names()] - self.input_handles = [ - self.predictor.get_input_handle(name) - for name in self.predictor.get_input_names() - ] - self.output_handle = [ - self.predictor.get_output_handle(name) - for name in self.predictor.get_output_names() - ] + self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()] + self.output_handle = [self.predictor.get_output_handle(name) for name in self.predictor.get_output_names()] def _get_inference_model(self): """ @@ -191,11 +178,10 @@ def _get_inference_model(self): # When the user-provided model path is already a static model, skip to_static conversion if self.is_static_model: - self.inference_model_path = os.path.join(self._task_path, - self._static_model_name) - if not os.path.exists(self.inference_model_path + - ".pdmodel") or not os.path.exists( - self.inference_model_path + ".pdiparams"): + self.inference_model_path = os.path.join(self._task_path, self._static_model_name) + if not os.path.exists(self.inference_model_path + ".pdmodel") or not os.path.exists( + self.inference_model_path + ".pdiparams" + ): raise IOError( f"{self._task_path} should include {self._static_model_name + '.pdmodel'} and {self._static_model_name + '.pdiparams'} while is_static_model is True" 
) @@ -205,8 +191,7 @@ def _get_inference_model(self): else: # Since 'self._task_path' is used to load the HF Hub path when 'from_hf_hub=True', we construct the static model path in a different way - self.inference_model_path = os.path.join(self._task_path, - self._static_model_name) + self.inference_model_path = os.path.join(self._task_path, self._static_model_name) self._tuned_trt_shape_file = self.inference_model_path + "_shape.txt" if not os.path.exists(self.inference_model_path + ".pdiparams"): with dygraph_mode_guard(): @@ -217,17 +202,12 @@ def _get_inference_model(self): self._static_model_file = self.inference_model_path + ".pdmodel" self._static_params_file = self.inference_model_path + ".pdiparams" - if (paddle.get_device().split(":", 1)[0] == "npu" and - self._infer_precision == "fp16"): + if paddle.get_device().split(":", 1)[0] == "npu" and self._infer_precision == "fp16": # transform fp32 model tp fp16 model self._static_fp16_model_file = self.inference_model_path + "-fp16.pdmodel" - self._static_fp16_params_file = ( - self.inference_model_path + "-fp16.pdiparams") - if not os.path.exists( - self._static_fp16_model_file) and not os.path.exists( - self._static_fp16_params_file): - logger.info( - "Converting to the inference model from fp32 to fp16.") + self._static_fp16_params_file = self.inference_model_path + "-fp16.pdiparams" + if not os.path.exists(self._static_fp16_model_file) and not os.path.exists(self._static_fp16_params_file): + logger.info("Converting to the inference model from fp32 to fp16.") paddle.inference.convert_to_mixed_precision( os.path.join(self._static_model_file), os.path.join(self._static_params_file), @@ -237,16 +217,16 @@ def _get_inference_model(self): mixed_precision=paddle.inference.PrecisionType.Half, # Here, npu sigmoid will lead to OOM and cpu sigmoid don't support fp16. # So, we add sigmoid to black list temporarily. - black_list={"sigmoid"}, ) + black_list={"sigmoid"}, + ) logger.info( - "The inference model in fp16 precison save in the path:{}". - format(self._static_fp16_model_file)) + "The inference model in fp16 precison save in the path:{}".format(self._static_fp16_model_file) + ) self._static_model_file = self._static_fp16_model_file self._static_params_file = self._static_fp16_params_file if self._predictor_type == "paddle-inference": - self._config = paddle.inference.Config(self._static_model_file, - self._static_params_file) + self._config = paddle.inference.Config(self._static_model_file, self._static_params_file) self._prepare_static_mode() else: self._prepare_onnx_mode() diff --git a/paddlemix/appflow/configuration.py b/paddlemix/appflow/configuration.py index 174729a2d8a19..bf2bd400d5b32 100644 --- a/paddlemix/appflow/configuration.py +++ b/paddlemix/appflow/configuration.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .image2image_text_guided_generation import (StableDiffusionImg2ImgTask, - StableDiffusionUpscaleTask) +from .image2image_text_guided_generation import ( + StableDiffusionImg2ImgTask, + StableDiffusionUpscaleTask, +) from .image2text_generation import Blip2CaptionTask from .openset_det_sam import OpenSetDetTask, OpenSetSegTask -from .text2image_generation import (StableDiffusionTask, - VersatileDiffusionDualGuidedTask) +from .text2image_generation import StableDiffusionTask, VersatileDiffusionDualGuidedTask from .text2image_inpaiting import StableDiffusionInpaintTask from .text2text_generation import ChatGlmTask from .text2video_generation import TextToVideoSDTask @@ -92,8 +93,7 @@ "models": { "Linaqruf/anything-v3.0": { "task_class": StableDiffusionImg2ImgTask, - "task_flag": - "image2image_text_guided_generation-Linaqruf/anything-v3.0", + "task_flag": "image2image_text_guided_generation-Linaqruf/anything-v3.0", } }, "default": { @@ -104,8 +104,7 @@ "models": { "stabilityai/stable-diffusion-x4-upscaler": { "task_class": StableDiffusionUpscaleTask, - "task_flag": - "image2image_text_guided_upscaling-stabilityai/stable-diffusion-x4-upscaler", + "task_flag": "image2image_text_guided_upscaling-stabilityai/stable-diffusion-x4-upscaler", } }, "default": { @@ -116,8 +115,7 @@ "models": { "shi-labs/versatile-diffusion": { "task_class": VersatileDiffusionDualGuidedTask, - "task_flag": - "dual_text_and_image_guided_generation-shi-labs/versatile-diffusion", + "task_flag": "dual_text_and_image_guided_generation-shi-labs/versatile-diffusion", } }, "default": { @@ -128,8 +126,7 @@ "models": { "damo-vilab/text-to-video-ms-1.7b": { "task_class": TextToVideoSDTask, - "task_flag": - "text_to_video_generation-damo-vilab/text-to-video-ms-1.7b", + "task_flag": "text_to_video_generation-damo-vilab/text-to-video-ms-1.7b", } }, "default": { diff --git a/paddlemix/appflow/image2image_text_guided_generation.py b/paddlemix/appflow/image2image_text_guided_generation.py index b58904d14ba7c..2894d7cbbafc1 100644 --- a/paddlemix/appflow/image2image_text_guided_generation.py +++ b/paddlemix/appflow/image2image_text_guided_generation.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle - -from ppdiffusers import (StableDiffusionImg2ImgPipeline, - StableDiffusionUpscalePipeline) +from ppdiffusers import StableDiffusionImg2ImgPipeline, StableDiffusionUpscalePipeline from .apptask import AppTask @@ -37,19 +34,18 @@ def _construct_model(self, model): """ # bulid model - model_instance = StableDiffusionImg2ImgPipeline.from_pretrained( - model, safety_checker=None) + model_instance = StableDiffusionImg2ImgPipeline.from_pretrained(model, safety_checker=None) self._model = model_instance def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" negative_prompt = inputs.get("negative_prompt", None) - assert negative_prompt is not None, f"The negative_prompt is None" + assert negative_prompt is not None, "The negative_prompt is None" return inputs @@ -63,7 +59,8 @@ def _run_model(self, inputs): negative_prompt=inputs["negative_prompt"], image=inputs["image"], guidance_scale=self._guidance_scale, - strength=self._strength, ).images[0] + strength=self._strength, + ).images[0] inputs.pop("prompt", None) inputs.pop("negative_prompt", None) @@ -101,9 +98,9 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" return inputs @@ -114,7 +111,8 @@ def _run_model(self, inputs): result = self._model( prompt=inputs["prompt"], - image=inputs["image"], ).images[0] + image=inputs["image"], + ).images[0] inputs.pop("prompt", None) inputs.pop("image", None) diff --git a/paddlemix/appflow/image2text_generation.py b/paddlemix/appflow/image2text_generation.py index 8e13a811cac52..1202560ee8fc5 100644 --- a/paddlemix/appflow/image2text_generation.py +++ b/paddlemix/appflow/image2text_generation.py @@ -19,7 +19,10 @@ from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration from paddlemix.processors.blip_processing import ( - Blip2Processor, BlipImageProcessor, BlipTextProcessor) + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.utils.log import logger from .apptask import AppTask @@ -41,23 +44,18 @@ def _construct_processor(self, model): Construct the tokenizer for the predictor. """ # bulid processor - tokenizer_class = AutoTokenizer.from_pretrained( - self._text_model, use_fast=False) - image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model, "processor", "eval")) - text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model, "processor", "eval")) + tokenizer_class = AutoTokenizer.from_pretrained(self._text_model, use_fast=False) + image_processor = BlipImageProcessor.from_pretrained(os.path.join(model, "processor", "eval")) + text_processor_class = BlipTextProcessor.from_pretrained(os.path.join(model, "processor", "eval")) - self._processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + self._processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) def _construct_model(self, model): """ Construct the inference model for the predictor. 
""" # bulid model - model_instance = Blip2ForConditionalGeneration.from_pretrained( - model, cache_dir=self._model_dir) + model_instance = Blip2ForConditionalGeneration.from_pretrained(model, cache_dir=self._model_dir) self._model = model_instance self._model.eval() @@ -65,7 +63,7 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = "describe the image" @@ -74,7 +72,8 @@ def _preprocess(self, inputs): text=prompt, return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) inputs["blip2_input"] = blip2_input @@ -97,8 +96,7 @@ def _postprocess(self, inputs): """ The model output is tag ids, this function will convert the model output to raw text. """ - generated_text = self._processor.batch_decode( - inputs["result"], skip_special_tokens=True)[0].strip() + generated_text = self._processor.batch_decode(inputs["result"], skip_special_tokens=True)[0].strip() logger.info("Generate text: {}".format(generated_text)) inputs.pop("result", None) @@ -111,10 +109,7 @@ def _generate_tags(self, caption): lemma = nltk.wordnet.WordNetLemmatizer() nltk.download(["punkt", "averaged_perceptron_tagger", "wordnet"]) - tags_list = [ - word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) - if pos[0] == "N" - ] + tags_list = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(caption)) if pos[0] == "N"] tags_lemma = [lemma.lemmatize(w) for w in tags_list] tags = ", ".join(map(str, tags_lemma)) diff --git a/paddlemix/appflow/openset_det_sam.py b/paddlemix/appflow/openset_det_sam.py index ee2b499856645..187c9ece0a56e 100644 --- a/paddlemix/appflow/openset_det_sam.py +++ b/paddlemix/appflow/openset_det_sam.py @@ -14,13 +14,11 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.taskflow.utils import static_mode_guard from paddlemix.models.groundingdino.modeling import GroundingDinoModel from paddlemix.models.sam.modeling import SamModel from paddlemix.processors.groundingdino_processing import GroudingDinoProcessor from paddlemix.processors.sam_processing import SamProcessor -from paddlemix.utils.log import logger from .apptask import AppTask @@ -57,21 +55,16 @@ def _construct_input_spec(self): Construct the input spec for the convert dygraph model to static model. 
""" self._input_spec = [ - paddle.static.InputSpec( - shape=[None, 3, None, None], name="x", - dtype="float32"), # image features - paddle.static.InputSpec( - shape=[None, None, None], name="m", dtype="int64"), # mask - paddle.static.InputSpec( - shape=[None, None], name="input_ids", dtype="int64"), - paddle.static.InputSpec( - shape=[None, None], name="attention_mask", dtype="int64"), + paddle.static.InputSpec(shape=[None, 3, None, None], name="x", dtype="float32"), # image features + paddle.static.InputSpec(shape=[None, None, None], name="m", dtype="int64"), # mask + paddle.static.InputSpec(shape=[None, None], name="input_ids", dtype="int64"), + paddle.static.InputSpec(shape=[None, None], name="attention_mask", dtype="int64"), paddle.static.InputSpec( shape=[None, None, None], name="text_self_attention_masks", - dtype="int64", ), - paddle.static.InputSpec( - shape=[None, None], name="position_ids", dtype="int64"), + dtype="int64", + ), + paddle.static.InputSpec(shape=[None, None], name="position_ids", dtype="int64"), ] def _construct_processor(self, model): @@ -79,8 +72,7 @@ def _construct_processor(self, model): Construct the tokenizer for the predictor. """ # bulid processor - self._processor = GroudingDinoProcessor.from_pretrained( - model, cache_dir=self._model_dir) + self._processor = GroudingDinoProcessor.from_pretrained(model, cache_dir=self._model_dir) def _construct_model(self, model): """ @@ -88,8 +80,7 @@ def _construct_model(self, model): """ # bulid model - model_instance = GroundingDinoModel.from_pretrained( - model, cache_dir=self._model_dir) + model_instance = GroundingDinoModel.from_pretrained(model, cache_dir=self._model_dir) # Load the model parameter for the predict model_instance.eval() @@ -98,13 +89,12 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" self._size = image.size - image_tensor, mask, tokenized_out = self._processor( - images=image, text=prompt) + image_tensor, mask, tokenized_out = self._processor(images=image, text=prompt) inputs["image_tensor"] = image_tensor inputs["mask"] = mask @@ -121,16 +111,18 @@ def _run_model(self, inputs): inputs["mask"] = paddle.cast(inputs["mask"], dtype="int64") inputs["tokenized_out"]["text_self_attention_masks"] = paddle.cast( - inputs["tokenized_out"]["text_self_attention_masks"], - dtype="int64") - [pred_boxes, pred_logits] = self.predictor.run([ - inputs["image_tensor"], - inputs["mask"], - inputs["tokenized_out"]["input_ids"], - inputs["tokenized_out"]["attention_mask"], - inputs["tokenized_out"]["text_self_attention_masks"], - inputs["tokenized_out"]["position_ids"], - ]) + inputs["tokenized_out"]["text_self_attention_masks"], dtype="int64" + ) + [pred_boxes, pred_logits] = self.predictor.run( + [ + inputs["image_tensor"], + inputs["mask"], + inputs["tokenized_out"]["input_ids"], + inputs["tokenized_out"]["attention_mask"], + inputs["tokenized_out"]["text_self_attention_masks"], + inputs["tokenized_out"]["position_ids"], + ] + ) result = {"pred_logits": pred_logits, "pred_boxes": pred_boxes} else: result = self._model( @@ -138,9 +130,9 @@ def _run_model(self, inputs): inputs["mask"], input_ids=inputs["tokenized_out"]["input_ids"], attention_mask=inputs["tokenized_out"]["attention_mask"], - 
text_self_attention_masks=inputs["tokenized_out"][ - "text_self_attention_masks"], - position_ids=inputs["tokenized_out"]["position_ids"], ) + text_self_attention_masks=inputs["tokenized_out"]["text_self_attention_masks"], + position_ids=inputs["tokenized_out"]["position_ids"], + ) inputs.pop("image_tensor", None) inputs.pop("mask", None) inputs.pop("tokenized_out", None) @@ -155,10 +147,8 @@ def _postprocess(self, inputs): """ if self._static_mode: - inputs["result"]["pred_logits"] = paddle.to_tensor(inputs["result"][ - "pred_logits"]) - inputs["result"]["pred_boxes"] = paddle.to_tensor(inputs["result"][ - "pred_boxes"]) + inputs["result"]["pred_logits"] = paddle.to_tensor(inputs["result"]["pred_logits"]) + inputs["result"]["pred_boxes"] = paddle.to_tensor(inputs["result"]["pred_boxes"]) logits = F.sigmoid(inputs["result"]["pred_logits"])[0] # (nq, 256) boxes = inputs["result"]["pred_boxes"][0] # (nq, 4) @@ -174,8 +164,7 @@ def _postprocess(self, inputs): pred_phrases = [] for logit, box in zip(logits_filt, boxes_filt): pred_phrase = self._processor.decode(logit > self._text_threshold) - pred_phrases.append(pred_phrase + - f"({str(logit.max().item())[:4]})") + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") H, W = self._size[1], self._size[0] boxes = [] @@ -239,10 +228,8 @@ def _construct_input_spec(self): shape2 = [64, 1, 2] self._input_spec = [ - paddle.static.InputSpec( - shape=shape, dtype="float32"), - paddle.static.InputSpec( - shape=shape2, dtype="int32"), + paddle.static.InputSpec(shape=shape, dtype="float32"), + paddle.static.InputSpec(shape=shape2, dtype="int32"), ] def _construct_processor(self, model): @@ -250,8 +237,7 @@ def _construct_processor(self, model): Construct the tokenizer for the predictor. """ # bulid processor - self._processor = SamProcessor.from_pretrained( - model, cache_dir=self._model_dir) + self._processor = SamProcessor.from_pretrained(model, cache_dir=self._model_dir) def _construct_model(self, model): """ @@ -259,8 +245,7 @@ def _construct_model(self, model): """ # bulid model - model_instance = SamModel.from_pretrained( - model, input_type=self._input_type, cache_dir=self._model_dir) + model_instance = SamModel.from_pretrained(model, input_type=self._input_type, cache_dir=self._model_dir) # Load the model parameter for the predict model_instance.eval() @@ -269,15 +254,13 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" box_prompt = inputs.get("boxes", None) points_prompt = inputs.get("points", None) - assert (box_prompt is not None or - points_prompt is not None), f"The prompt is None" + assert box_prompt is not None or points_prompt is not None, "The prompt is None" if box_prompt is not None: - box_prompt = (box_prompt if isinstance(box_prompt, np.ndarray) else - np.array(box_prompt)) + box_prompt = box_prompt if isinstance(box_prompt, np.ndarray) else np.array(box_prompt) if points_prompt is not None: points_prompt = np.array([points_prompt]) @@ -285,7 +268,8 @@ def _preprocess(self, inputs): image, input_type=self._input_type, box=box_prompt, - point_coords=points_prompt, ) + point_coords=points_prompt, + ) inputs["image_seg"] = image_seg inputs["prompt"] = prompt @@ -306,8 +290,7 @@ def _run_model(self, inputs): result = result[0] else: - result = self._model( - img=inputs["image_seg"], prompt=inputs["prompt"]) + result = self._model(img=inputs["image_seg"], 
prompt=inputs["prompt"]) inputs.pop("image_seg", None) diff --git a/paddlemix/appflow/text2image_generation.py b/paddlemix/appflow/text2image_generation.py index bebe51fdaf1ef..2b30c0b831245 100644 --- a/paddlemix/appflow/text2image_generation.py +++ b/paddlemix/appflow/text2image_generation.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle - -from ppdiffusers import (StableDiffusionPipeline, - VersatileDiffusionDualGuidedPipeline) +from ppdiffusers import StableDiffusionPipeline, VersatileDiffusionDualGuidedPipeline from .apptask import AppTask @@ -45,7 +42,7 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" return inputs @@ -58,7 +55,8 @@ def _run_model(self, inputs): prompt=inputs["prompt"], guidance_scale=self._guidance_scale, height=self._height, - width=self._width, ).images[0] + width=self._width, + ).images[0] inputs.pop("prompt", None) @@ -77,8 +75,7 @@ def _postprocess(self, inputs): class VersatileDiffusionDualGuidedTask(AppTask): def __init__(self, task, model, **kwargs): super().__init__(task=task, model=model, **kwargs) - self._text_to_image_strength = kwargs.get("text_to_image_strength", - 0.75) + self._text_to_image_strength = kwargs.get("text_to_image_strength", 0.75) # Default to static mode self._static_mode = False self._construct_model(model) @@ -89,17 +86,16 @@ def _construct_model(self, model): """ # bulid model - model_instance = VersatileDiffusionDualGuidedPipeline.from_pretrained( - model) + model_instance = VersatileDiffusionDualGuidedPipeline.from_pretrained(model) model_instance.remove_unused_weights() self._model = model_instance def _preprocess(self, inputs): """ """ prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" return inputs @@ -111,7 +107,8 @@ def _run_model(self, inputs): result = self._model( prompt=inputs["prompt"], image=inputs["image"], - text_to_image_strength=self._text_to_image_strength, ).images[0] + text_to_image_strength=self._text_to_image_strength, + ).images[0] inputs.pop("prompt", None) inputs.pop("image", None) diff --git a/paddlemix/appflow/text2image_inpaiting.py b/paddlemix/appflow/text2image_inpaiting.py index 4f363791397cf..311dc5b8acf24 100644 --- a/paddlemix/appflow/text2image_inpaiting.py +++ b/paddlemix/appflow/text2image_inpaiting.py @@ -15,7 +15,6 @@ import paddle from PIL import Image -from paddlemix.utils.log import logger from ppdiffusers import StableDiffusionInpaintPipeline from .apptask import AppTask @@ -45,11 +44,11 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" seg_masks = inputs.get("seg_masks", None) - assert seg_masks is not None, f"The seg masks is None" + assert seg_masks is not None, "The seg masks is None" inpaint_prompt = inputs.get("inpaint_prompt", None) - assert inpaint_prompt is not None, f"The inpaint_prompt is None" + assert inpaint_prompt is not None, "The inpaint_prompt is None" self._org_size = image.size merge_mask = paddle.sum(seg_masks, axis=0).unsqueeze(0) @@ -72,7 +71,8 @@ 
def _run_model(self, inputs): result = self._model( inputs["inpaint_prompt"], image=inputs["image"], - mask_image=inputs["mask_pil"], ).images[0] + mask_image=inputs["mask_pil"], + ).images[0] inputs.pop("mask_pil", None) inputs.pop("image", None) diff --git a/paddlemix/appflow/text2text_generation.py b/paddlemix/appflow/text2text_generation.py index 0a29e36b71049..9ceb11cbefdb8 100644 --- a/paddlemix/appflow/text2text_generation.py +++ b/paddlemix/appflow/text2text_generation.py @@ -14,8 +14,6 @@ from paddlenlp import Taskflow -from paddlemix.utils.log import logger - from .apptask import AppTask @@ -41,15 +39,16 @@ def _construct_model(self, model): def _preprocess(self, inputs): """ """ image = inputs.get("image", None) - assert image is not None, f"The image is None" + assert image is not None, "The image is None" prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" prompt = ( "Given caption,extract the main object to be replaced and marked it as 'main_object', " - + f"Extract the remaining part as 'other prompt', " + - f"Return main_object, other prompt in English" + - f"Given caption: {prompt}.") + + "Extract the remaining part as 'other prompt', " + + "Return main_object, other prompt in English" + + "Given caption: {}.".format(prompt) + ) inputs["prompt"] = prompt @@ -74,7 +73,8 @@ def _postprocess(self, inputs): prompt, inpaint_prompt = ( inputs["result"].split("\n")[0].split(":")[-1].strip(), - inputs["result"].split("\n")[-1].split(":")[-1].strip(), ) + inputs["result"].split("\n")[-1].split(":")[-1].strip(), + ) inputs.pop("result", None) diff --git a/paddlemix/appflow/text2video_generation.py b/paddlemix/appflow/text2video_generation.py index 77b374eaabecb..290917706c77a 100644 --- a/paddlemix/appflow/text2video_generation.py +++ b/paddlemix/appflow/text2video_generation.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle from ppdiffusers import DPMSolverMultistepScheduler, TextToVideoSDPipeline @@ -34,14 +33,13 @@ def _construct_model(self, model): # bulid model model_instance = TextToVideoSDPipeline.from_pretrained(model) - model_instance.scheduler = DPMSolverMultistepScheduler.from_config( - model_instance.scheduler.config) + model_instance.scheduler = DPMSolverMultistepScheduler.from_config(model_instance.scheduler.config) self._model = model_instance def _preprocess(self, inputs): """ """ prompt = inputs.get("prompt", None) - assert prompt is not None, f"The prompt is None" + assert prompt is not None, "The prompt is None" num_inference_steps = inputs.get("num_inference_steps", 25) inputs["num_inference_steps"] = num_inference_steps @@ -54,7 +52,8 @@ def _run_model(self, inputs): result = self._model( prompt=inputs["prompt"], - num_inference_steps=inputs["num_inference_steps"], ).frames + num_inference_steps=inputs["num_inference_steps"], + ).frames inputs.pop("prompt", None) diff --git a/paddlemix/checkpoint.py b/paddlemix/checkpoint.py index 3d885eab2494a..81d44482730d2 100644 --- a/paddlemix/checkpoint.py +++ b/paddlemix/checkpoint.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import shutil import paddle @@ -25,21 +26,17 @@ def save(args, model, optimizer, epoch=0, step=0, output_dir="", is_best=False): return if output_dir and isinstance(output_dir, str): - output_dir = os.path.join(output_dir, - "epoch_%d_step_%d" % (epoch, step)) + output_dir = os.path.join(output_dir, "epoch_%d_step_%d" % (epoch, step)) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) print("Save model to %s" % output_dir) - save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format( - output_dir, args.mp_rank, args.sharding_rank) + save_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format(output_dir, args.mp_rank, args.sharding_rank) # if args.sharding_stage == 3: # model.get_all_parameters(convert2cpu=False) - paddle.save(model.state_dict(), - os.path.join(save_dir, "model.pdparams")) - paddle.save(optimizer.state_dict(), - os.path.join(save_dir, "model_state.pdopt")) + paddle.save(model.state_dict(), os.path.join(save_dir, "model.pdparams")) + paddle.save(optimizer.state_dict(), os.path.join(save_dir, "model_state.pdopt")) if is_best: shutil.copyfile("model.pdparams", "model_best.pdparams") meta_dict = { @@ -60,19 +57,15 @@ def load_model(args, model, optimizer=None, ckpt_dir=""): if ckpt_dir and isinstance(ckpt_dir, str) and os.path.isdir(ckpt_dir): print("Try to load checkpoint from %s " % ckpt_dir) - load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format( - ckpt_dir, args.mp_rank, args.sharding_rank) + load_dir = "{}/mp_{:0>2d}_sharding_{:0>2d}".format(ckpt_dir, args.mp_rank, args.sharding_rank) model_path = os.path.join(load_dir, "model.pdparams") opt_path = os.path.join(load_dir, "model_state.pdopt") - meta_path = os.path.join(load_dir, "meta_state.pdopt") + # meta_path = os.path.join(load_dir, "meta_state.pdopt") if os.path.exists(model_path): model_dict = paddle.load(model_path) for name, param in model.state_dict().items(): - assert ( - name in model_dict.keys() - ), "No param named `{}` was found in checkpoint file.".format( - name) + assert name in model_dict.keys(), "No param named `{}` was found in checkpoint file.".format(name) if param.dtype != model_dict[name].dtype: model_dict[name] = model_dict[name].cast(param.dtype) @@ -125,9 +118,7 @@ def load_model(args, model, optimizer=None, ckpt_dir=""): ] rowlinear_list = [] all_list = collinear_list + rowlinear_list + embedding_list - skip_list = [ - "visual.patch_embed.proj.weight", "visual.patch_embed.proj.bias" - ] + skip_list = ["visual.patch_embed.proj.weight", "visual.patch_embed.proj.bias"] col_list = [] row_list = [] @@ -148,22 +139,21 @@ def renamebias(model_dict, whole_key): def col_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[1] // mp_size - return model_dict[:, mp_rank * subbatch:(mp_rank + 1) * - subbatch] + return model_dict[:, mp_rank * subbatch : (mp_rank + 1) * subbatch] elif len(model_dict.shape) == 1: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] def row_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] else: return model_dict def emb_split_modeldict(model_dict): subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] model_dict = 
paddle.load(ckpt_dir) modelkeys = list(model_dict.keys()) @@ -180,28 +170,22 @@ def emb_split_modeldict(model_dict): if key in all_list: if key in collinear_list: col_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = col_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = col_split_modeldict(model_dict[whole_key]) elif key in rowlinear_list: row_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = row_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = row_split_modeldict(model_dict[whole_key]) else: emb_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = emb_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = emb_split_modeldict(model_dict[whole_key]) if args.context_length != 77: - model_dict["text.positional_embedding"] = model_dict[ - "text.positional_embedding"][:args.context_length, :] + model_dict["text.positional_embedding"] = model_dict["text.positional_embedding"][: args.context_length, :] - print("cast state_dict to default dtype:{}".format( - paddle.get_default_dtype())) + print("cast state_dict to default dtype:{}".format(paddle.get_default_dtype())) for key, value in model_dict.items(): if "freqs_cos" in key or "freqs_sin" in key: continue - model_dict[key] = paddle.cast( - value, dtype=paddle.get_default_dtype()) + model_dict[key] = paddle.cast(value, dtype=paddle.get_default_dtype()) model.set_state_dict(model_dict) del model_dict else: diff --git a/paddlemix/datasets/caption_dataset.py b/paddlemix/datasets/caption_dataset.py index 54ab650f3ddfe..3bcff989fe2bf 100644 --- a/paddlemix/datasets/caption_dataset.py +++ b/paddlemix/datasets/caption_dataset.py @@ -33,25 +33,27 @@ class CaptionDataset(DatasetBuilder): """ URL = "https://bj.bcebos.com/paddlemix/datasets/coco.tar.gz" - META_INFO = collections.namedtuple( - "META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) + META_INFO = collections.namedtuple("META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) MD5 = "" SPLITS = { "train": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_train.json"), "", - "aa31ac474cf6250ebb81d18348a07ed8", ), + "aa31ac474cf6250ebb81d18348a07ed8", + ), "val": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_val.json"), "", - "b273847456ef5580e33713b1f7de52a0", ), + "b273847456ef5580e33713b1f7de52a0", + ), "test": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_test.json"), "", - "3ff34b0ef2db02d01c37399f6a2a6cd1", ), + "3ff34b0ef2db02d01c37399f6a2a6cd1", + ), } def _get_data(self, mode, **kwargs): @@ -108,7 +110,6 @@ def _read(self, filename, *args): else: yield_data = { "image": image_path, - "image_id": ann["image"].split("/")[-1].strip(".jpg") - .split("_")[-1], + "image_id": ann["image"].split("/")[-1].strip(".jpg").split("_")[-1], } yield yield_data diff --git a/paddlemix/datasets/coco_clip.py b/paddlemix/datasets/coco_clip.py index 17166b44859d9..7dcd6e6c26661 100644 --- a/paddlemix/datasets/coco_clip.py +++ b/paddlemix/datasets/coco_clip.py @@ -26,25 +26,27 @@ class CaptionCLIP(DatasetBuilder): URL = "https://bj.bcebos.com/paddlemix/datasets/coco.tar.gz" - META_INFO = collections.namedtuple( - "META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) + META_INFO = collections.namedtuple("META_INFO", ("images", "annotations", "images_md5", "annotations_md5")) MD5 = "" SPLITS = { "train": META_INFO( 
os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_train.json"), "", - "aa31ac474cf6250ebb81d18348a07ed8", ), + "aa31ac474cf6250ebb81d18348a07ed8", + ), "val": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_val.json"), "", - "b273847456ef5580e33713b1f7de52a0", ), + "b273847456ef5580e33713b1f7de52a0", + ), "test": META_INFO( os.path.join("coco", "images"), os.path.join("coco", "annotations/coco_karpathy_test.json"), "", - "3ff34b0ef2db02d01c37399f6a2a6cd1", ), + "3ff34b0ef2db02d01c37399f6a2a6cd1", + ), } def _get_data(self, mode, **kwargs): @@ -74,7 +76,6 @@ def _gen_image_id(self, anno): def _read(self, filename, *args): image_root, anno_path, mode = filename annotations = json.load(open(anno_path, "r")) - image_ids = self._gen_image_id(annotations) for ann in annotations: image_path = os.path.join(image_root, ann["image"]) diff --git a/paddlemix/datasets/dataset.py b/paddlemix/datasets/dataset.py index 96452fb68de78..047bbfd796e57 100644 --- a/paddlemix/datasets/dataset.py +++ b/paddlemix/datasets/dataset.py @@ -64,7 +64,7 @@ class DatasetTuple: def __init__(self, splits): self.identifier_map, identifiers = self._gen_identifier_map(splits) self.tuple_cls = namedtuple("datasets", identifiers) - self.tuple = self.tuple_cls(* [None for _ in splits]) + self.tuple = self.tuple_cls(*[None for _ in splits]) def __getitem__(self, key): if isinstance(key, (int, slice)): @@ -116,8 +116,7 @@ def load_from_hf(path, name=None, splits=None, **kwargs): try: hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) except FileNotFoundError: - raise FileNotFoundError("Couldn't find the dataset script for '" + path - + "' on PaddleNLP or HuggingFace") + raise FileNotFoundError("Couldn't find the dataset script for '" + path + "' on PaddleNLP or HuggingFace") else: label_list = [] if isinstance(hf_datasets, DatasetDict): @@ -133,8 +132,7 @@ def load_from_hf(path, name=None, splits=None, **kwargs): for feature in hf_datasets[i].features.values(): if isinstance(feature, ClassLabel): label_list = feature.names - datasets[split] = MapDataset( - hf_datasets[i], label_list=label_list) + datasets[split] = MapDataset(hf_datasets[i], label_list=label_list) else: for feature in hf_datasets.features.values(): if isinstance(feature, ClassLabel): @@ -143,12 +141,7 @@ def load_from_hf(path, name=None, splits=None, **kwargs): return datasets -def load_dataset(path_or_read_func, - name=None, - data_files=None, - splits=None, - lazy=None, - **kwargs): +def load_dataset(path_or_read_func, name=None, data_files=None, splits=None, lazy=None, **kwargs): """ This method will load a dataset, either form PaddleNLP library or from a self-defined data loading script, by calling functions in `DatasetBuilder`. @@ -197,26 +190,22 @@ def load_dataset(path_or_read_func, try: reader_cls = import_main_class(path_or_read_func) except ModuleNotFoundError: - datasets = load_from_hf( - path_or_read_func, name=name, splits=splits, **kwargs) + datasets = load_from_hf(path_or_read_func, name=name, splits=splits, **kwargs) else: reader_instance = reader_cls(lazy=lazy, name=name, **kwargs) # Check if selected name and split is valid in this DatasetBuilder if hasattr(reader_instance, "BUILDER_CONFIGS"): if name in reader_cls.BUILDER_CONFIGS.keys(): - split_names = reader_cls.BUILDER_CONFIGS[name][ - "splits"].keys() + split_names = reader_cls.BUILDER_CONFIGS[name]["splits"].keys() else: raise ValueError( - 'Invalid name "{}". 
Should be one of {}.'.format( - name, list(reader_cls.BUILDER_CONFIGS.keys()))) + 'Invalid name "{}". Should be one of {}.'.format(name, list(reader_cls.BUILDER_CONFIGS.keys())) + ) elif hasattr(reader_instance, "SPLITS"): split_names = reader_instance.SPLITS.keys() else: - raise AttributeError( - "Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder." - ) + raise AttributeError("Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder.") selected_splits = [] if isinstance(splits, list) or isinstance(splits, tuple): @@ -226,11 +215,9 @@ def load_dataset(path_or_read_func, for split_name in selected_splits: if split_name not in split_names and split_name is not None: - raise ValueError('Invalid split "{}". Should be one of {}.'. - format(split_name, list(split_names))) + raise ValueError('Invalid split "{}". Should be one of {}.'.format(split_name, list(split_names))) - datasets = reader_instance.read_datasets( - data_files=data_files, splits=splits) + datasets = reader_instance.read_datasets(data_files=data_files, splits=splits) return datasets @@ -268,8 +255,7 @@ def __getitem__(self, idx): Basic function of `MapDataset` to get sample from dataset with a given index. """ - return (self._transform(self.new_data[idx]) - if self._transform_pipline else self.new_data[idx]) + return self._transform(self.new_data[idx]) if self._transform_pipline else self.new_data[idx] def __len__(self): """ @@ -291,21 +277,12 @@ def filter(self, fn, num_workers=0): assert num_workers >= 0, "num_workers should be a non-negative value" if num_workers > 1: shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) - ] - kwds_per_shard = [ - dict( - self=shards[rank], fn=fn) for rank in range(num_workers) + self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers) ] - pool = Pool(num_workers, initargs=(RLock(), )) + kwds_per_shard = [dict(self=shards[rank], fn=fn) for rank in range(num_workers)] + pool = Pool(num_workers, initargs=(RLock(),)) - results = [ - pool.apply_async( - self.__class__._filter, kwds=kwds) - for kwds in kwds_per_shard - ] + results = [pool.apply_async(self.__class__._filter, kwds=kwds) for kwds in kwds_per_shard] transformed_shards = [r.get() for r in results] pool.close() @@ -318,15 +295,11 @@ def filter(self, fn, num_workers=0): return self._filter(fn) def _filter(self, fn): - self.new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if fn(self.new_data[idx]) - ] + self.new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if fn(self.new_data[idx])] return self def shard(self, num_shards=None, index=None, contiguous=False): - self.new_data = self._shard( - num_shards=num_shards, index=index, contiguous=contiguous).data + self.new_data = self._shard(num_shards=num_shards, index=index, contiguous=contiguous).data return self def _shard(self, num_shards=None, index=None, contiguous=False): @@ -359,10 +332,7 @@ def _shard(self, num_shards=None, index=None, contiguous=False): end = start + div + (1 if index < mod else 0) new_data = [self.new_data[idx] for idx in range(start, end)] else: - new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if idx % num_shards == index - ] + new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if idx % num_shards == index] return MapDataset(new_data) @@ -388,20 +358,13 @@ def map(self, fn, lazy=True, batched=False, num_workers=0): assert num_workers >= 0, 
"num_workers should be a non-negative value" if num_workers > 1: shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) + self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers) ] kwds_per_shard = [ - dict( - self=shards[rank], fn=fn, lazy=False, batched=batched) - for rank in range(num_workers) - ] - pool = Pool(num_workers, initargs=(RLock(), )) - results = [ - pool.apply_async( - self.__class__._map, kwds=kwds) for kwds in kwds_per_shard + dict(self=shards[rank], fn=fn, lazy=False, batched=batched) for rank in range(num_workers) ] + pool = Pool(num_workers, initargs=(RLock(),)) + results = [pool.apply_async(self.__class__._map, kwds=kwds) for kwds in kwds_per_shard] transformed_shards = [r.get() for r in results] pool.close() pool.join() @@ -418,9 +381,7 @@ def _map(self, fn, lazy=True, batched=False): elif lazy: self._transform_pipline.append(fn) else: - self.new_data = [ - fn(self.new_data[idx]) for idx in range(len(self.new_data)) - ] + self.new_data = [fn(self.new_data[idx]) for idx in range(len(self.new_data))] return self @@ -468,23 +429,19 @@ def __iter__(self): num_samples = 0 if inspect.isfunction(self.data): for example in self.data(): - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example + if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter( + num_samples=num_samples + ): + yield self._transform(example) if self._transform_pipline else example num_samples += 1 else: if inspect.isgenerator(self.data): - warnings.warn( - "Reciving generator as data source, data can only be iterated once" - ) + warnings.warn("Reciving generator as data source, data can only be iterated once") for example in self.data: - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example + if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter( + num_samples=num_samples + ): + yield self._transform(example) if self._transform_pipline else example num_samples += 1 def filter(self, fn): @@ -578,22 +535,23 @@ def remove_if_exit(filepath): if data_files is None: if splits is None: - splits = (list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) - if hasattr(self, "BUILDER_CONFIGS") else - list(self.SPLITS.keys())) + splits = ( + list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) + if hasattr(self, "BUILDER_CONFIGS") + else list(self.SPLITS.keys()) + ) assert ( - isinstance(splits, str) or - (isinstance(splits, list) and isinstance(splits[0], str)) or - (isinstance(splits, tuple) and isinstance(splits[0], str)) + isinstance(splits, str) + or (isinstance(splits, list) and isinstance(splits[0], str)) + or (isinstance(splits, tuple) and isinstance(splits[0], str)) ), "`splits` should be a string or list of string or a tuple of string." 
if isinstance(splits, str): splits = [splits] datasets = DatasetTuple(splits) parallel_env = dist.ParallelEnv() - unique_endpoints = _get_unique_endpoints( - parallel_env.trainer_endpoints[:]) + unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:]) # move register hook to first and register togather lock_files = [] for split in splits: @@ -625,8 +583,7 @@ def remove_if_exit(filepath): datasets[split] = self.read(filename=filename, split=split) else: assert ( - isinstance(data_files, str) or isinstance(data_files, tuple) or - isinstance(data_files, list) + isinstance(data_files, str) or isinstance(data_files, tuple) or isinstance(data_files, list) ), "`data_files` should be a string or tuple or list of strings." if isinstance(data_files, str): data_files = [data_files] @@ -639,14 +596,11 @@ def remove_if_exit(filepath): data_files ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of loacl data file." for i in range(len(data_files)): - datasets[splits[i]] = self.read( - filename=data_files[i], split=splits[i]) + datasets[splits[i]] = self.read(filename=data_files[i], split=splits[i]) else: - datasets = DatasetTuple( - ["split" + str(i) for i in range(len(data_files))]) + datasets = DatasetTuple(["split" + str(i) for i in range(len(data_files))]) for i in range(len(data_files)): - datasets["split" + str(i)] = self.read( - filename=data_files[i], split=default_split) + datasets["split" + str(i)] = self.read(filename=data_files[i], split=default_split) return datasets if len(datasets) > 1 else datasets[0] @@ -701,9 +655,9 @@ def _convert_label_to_id(labels, label_dict): if self.lazy: def generate_examples(): - generator = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) + generator = ( + self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename) + ) for example in generator: # We need to check if the example contains label column and confirm its name. # For now we only allow `label` or `labels` to be the name of label column. @@ -720,24 +674,17 @@ def generate_examples(): # For multiple labels in the form of list. if isinstance(label_dict, list): for idx, sub_dict in enumerate(label_dict): - example[label_col][idx] = _convert_label_to_id( - example[label_col][idx], sub_dict) + example[label_col][idx] = _convert_label_to_id(example[label_col][idx], sub_dict) else: - example[label_col] = _convert_label_to_id( - example[label_col], label_dict) + example[label_col] = _convert_label_to_id(example[label_col], label_dict) yield example else: yield example - return IterDataset( - generate_examples(), - label_list=label_list, - vocab_info=vocab_info) + return IterDataset(generate_examples(), label_list=label_list, vocab_info=vocab_info) else: - examples = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) + examples = self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename) # Then some validation. if not isinstance(examples, list): @@ -745,8 +692,8 @@ def generate_examples(): if not examples: raise ValueError( - "No instances were read from the given filepath {}. " - "Is the path correct?".format(filename)) + "No instances were read from the given filepath {}. " "Is the path correct?".format(filename) + ) # We need to check if the example contains label column and confirm its name. # For now we only allow `label` or `labels` to be the name of label column. 
@@ -764,14 +711,11 @@ def generate_examples(): # For multiple labels in the form of list. if isinstance(label_dict, list): for i, sub_dict in enumerate(label_dict): - examples[idx][label_col][i] = _convert_label_to_id( - examples[idx][label_col][i], sub_dict) + examples[idx][label_col][i] = _convert_label_to_id(examples[idx][label_col][i], sub_dict) else: - examples[idx][label_col] = _convert_label_to_id( - examples[idx][label_col], label_dict) + examples[idx][label_col] = _convert_label_to_id(examples[idx][label_col], label_dict) - return MapDataset( - examples, label_list=label_list, vocab_info=vocab_info) + return MapDataset(examples, label_list=label_list, vocab_info=vocab_info) def _read(self, filename: str, *args): """ @@ -820,15 +764,13 @@ def generate_examples(): return IterDataset(generate_examples) else: examples = self._read(**kwargs) - if hasattr(examples, "__len__") and hasattr(examples, - "__getitem__"): + if hasattr(examples, "__len__") and hasattr(examples, "__getitem__"): return MapDataset(examples) else: return MapDataset(list(examples)) -def has_file_allowed_extension(filename: str, - extensions: Union[str, Tuple[str, ...]]) -> bool: +def has_file_allowed_extension(filename: str, extensions: Union[str, Tuple[str, ...]]) -> bool: """Checks if a file is an allowed extension. Args: @@ -838,8 +780,7 @@ def has_file_allowed_extension(filename: str, Returns: bool: True if the filename ends with one of given extensions """ - return filename.lower().endswith( - extensions if isinstance(extensions, str) else tuple(extensions)) + return filename.lower().endswith(extensions if isinstance(extensions, str) else tuple(extensions)) def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]: @@ -847,22 +788,20 @@ def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]: See :class:`DatasetFolder` for details. """ - classes = sorted( - entry.name for entry in os.scandir(directory) if entry.is_dir()) + classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir()) if not classes: - raise FileNotFoundError( - f"Couldn't find any class folder in {directory}.") + raise FileNotFoundError(f"Couldn't find any class folder in {directory}.") class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} return classes, class_to_idx def make_dataset( - directory: str, - class_to_idx: Optional[Dict[str, int]]=None, - extensions: Optional[Union[str, Tuple[str, ...]]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[Tuple[ - str, int]]: + directory: str, + class_to_idx: Optional[Dict[str, int]] = None, + extensions: Optional[Union[str, Tuple[str, ...]]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, +) -> List[Tuple[str, int]]: """Generates a list of samples of a form (path_to_sample, class). See :class:`DatasetFolder` for details. @@ -875,22 +814,17 @@ def make_dataset( if class_to_idx is None: _, class_to_idx = find_classes(directory) elif not class_to_idx: - raise ValueError( - "'class_to_index' must have at least one entry to collect any samples." 
- ) + raise ValueError("'class_to_index' must have at least one entry to collect any samples.") both_none = extensions is None and is_valid_file is None both_something = extensions is not None and is_valid_file is not None if both_none or both_something: - raise ValueError( - "Both extensions and is_valid_file cannot be None or not None at the same time" - ) + raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time") if extensions is not None: def is_valid_file(x: str) -> bool: - return has_file_allowed_extension( - x, extensions) # type: ignore[arg-type] + return has_file_allowed_extension(x, extensions) # type: ignore[arg-type] is_valid_file = cast(Callable[[str], bool], is_valid_file) @@ -913,9 +847,7 @@ def is_valid_file(x: str) -> bool: empty_classes = set(class_to_idx.keys()) - available_classes if empty_classes: - msg = ( - f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. " - ) + msg = f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. " if extensions is not None: msg += f"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}" raise FileNotFoundError(msg) @@ -951,13 +883,14 @@ class DatasetFolder(Dataset): """ def __init__( - self, - root: str, - loader: Callable[[str], Any], - extensions: Optional[Tuple[str, ...]]=None, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> None: + self, + root: str, + loader: Callable[[str], Any], + extensions: Optional[Tuple[str, ...]] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> None: # super().__init__(root, transform=transform, target_transform=target_transform) # super().__init__() self.root = root @@ -965,8 +898,7 @@ def __init__( self.target_transform = target_transform classes, class_to_idx = self.find_classes(self.root) - samples = self.make_dataset(self.root, class_to_idx, extensions, - is_valid_file) + samples = self.make_dataset(self.root, class_to_idx, extensions, is_valid_file) self.loader = loader self.extensions = extensions @@ -978,11 +910,11 @@ def __init__( @staticmethod def make_dataset( - directory: str, - class_to_idx: Dict[str, int], - extensions: Optional[Tuple[str, ...]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[ - Tuple[str, int]]: + directory: str, + class_to_idx: Dict[str, int], + extensions: Optional[Tuple[str, ...]] = None, + is_valid_file: Optional[Callable[[str], bool]] = None, + ) -> List[Tuple[str, int]]: """Generates a list of samples of a form (path_to_sample, class). This can be overridden to e.g. read files from a compressed zip file instead of from the disk. @@ -1010,11 +942,7 @@ def make_dataset( # find_classes() function, instead of using that of the find_classes() method, which # is potentially overridden and thus could have a different logic. 
raise ValueError("The class_to_idx parameter cannot be None.") - return make_dataset( - directory, - class_to_idx, - extensions=extensions, - is_valid_file=is_valid_file) + return make_dataset(directory, class_to_idx, extensions=extensions, is_valid_file=is_valid_file) def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]: """Find the class folders in a dataset structured as follows:: @@ -1075,7 +1003,8 @@ def __len__(self) -> int: ".pgm", ".tif", ".tiff", - ".webp", ) + ".webp", +) def pil_loader(path: str) -> Image.Image: @@ -1120,17 +1049,19 @@ class ImageFolder(DatasetFolder): """ def __init__( - self, - root: str, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - loader: Callable[[str], Any]=default_loader, - is_valid_file: Optional[Callable[[str], bool]]=None, ): + self, + root: str, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + loader: Callable[[str], Any] = default_loader, + is_valid_file: Optional[Callable[[str], bool]] = None, + ): super().__init__( root, loader, IMG_EXTENSIONS if is_valid_file is None else None, transform=transform, target_transform=target_transform, - is_valid_file=is_valid_file, ) + is_valid_file=is_valid_file, + ) self.imgs = self.samples diff --git a/paddlemix/datasets/laion_clip.py b/paddlemix/datasets/laion_clip.py index 24edc6a38e340..4d4fa6c5a104f 100644 --- a/paddlemix/datasets/laion_clip.py +++ b/paddlemix/datasets/laion_clip.py @@ -11,19 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import base64 -import gzip -import io + import logging import os -import random -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast -import paddle import paddle.vision.datasets as datasets from easydict import EasyDict as edict -from paddle.io import DataLoader, Dataset, IterableDataset, get_worker_info -from PIL import Image +from paddle.io import DataLoader from .dataset import ImageFolder @@ -39,8 +33,7 @@ def get_classification(args, preprocess_fns): for data_path in data_paths: data_path = data_path.rstrip("/") logging.info(f"adding classification dataset: {data_path}") - dataset = datasets.ImageFolder( - f"{data_path}/images", transform=preprocess_fn) + dataset = datasets.ImageFolder(f"{data_path}/images", transform=preprocess_fn) dataset = ImageFolder(f"{data_path}/images", transform=preprocess_fn) @@ -48,7 +41,8 @@ def get_classification(args, preprocess_fns): dataset, batch_size=args.per_device_eval_batch_size, # hard code num_workers=args.dataloader_num_workers, - shuffle=False, ) + shuffle=False, + ) classname_filename = f"{data_path}/labels.txt" template_filename = f"{data_path}/templates.txt" @@ -56,7 +50,8 @@ def get_classification(args, preprocess_fns): result[f"{os.path.basename(data_path)}"] = edict( dataloader=dataloader, classname_filename=classname_filename, - template_filename=template_filename, ) + template_filename=template_filename, + ) return result diff --git a/paddlemix/examples/Sam/run_predict.py b/paddlemix/examples/Sam/run_predict.py index 46591f2abd0cd..cbfe59f068785 100644 --- a/paddlemix/examples/Sam/run_predict.py +++ b/paddlemix/examples/Sam/run_predict.py @@ -18,11 +18,9 @@ import matplotlib.pyplot as plt import numpy as np -import paddle -import paddle.nn.functional as F import requests from paddlenlp.trainer import PdArgumentParser -from PIL import Image, ImageDraw, 
ImageFont +from PIL import Image from paddlemix.models.sam.modeling import SamModel from paddlemix.processors.sam_processing import SamProcessor @@ -56,11 +54,8 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - box_prompt: List[int] = field( - default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) - points_prompt: List[int] = field( - default=None, - metadata={"help": "point promt format as [[xy],[xy]...]."}) + box_prompt: List[int] = field(default=None, metadata={"help": "box promt format as xyxyxyxy...]."}) + points_prompt: List[int] = field(default=None, metadata={"help": "point promt format as [[xy],[xy]...]."}) @dataclass @@ -71,19 +66,20 @@ class ModelArguments: model_name_or_path: str = field( default="Sam/SamVitH-1024", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) input_type: str = field( default="boxs", - metadata={ - "help": - "The model prompt type, choices ['boxs', 'points', 'points_grid']." - }, ) + metadata={"help": "The model prompt type, choices ['boxs', 'points', 'points_grid']."}, + ) output_dir: str = field( default="seg_output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -94,15 +90,13 @@ def main(): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") # bulid processor processor = SamProcessor.from_pretrained(model_args.model_name_or_path) # bulid model logger.info("SamModel: {}".format(model_args.model_name_or_path)) - sam_model = SamModel.from_pretrained( - model_args.model_name_or_path, input_type=model_args.input_type) + sam_model = SamModel.from_pretrained(model_args.model_name_or_path, input_type=model_args.input_type) if data_args.box_prompt is not None: data_args.box_prompt = np.array(data_args.box_prompt) @@ -113,7 +107,8 @@ def main(): image_pil, input_type=model_args.input_type, box=data_args.box_prompt, - point_coords=data_args.points_prompt, ) + point_coords=data_args.points_prompt, + ) seg_masks = sam_model(img=image_seg, prompt=prompt) seg_masks = processor.postprocess_masks(seg_masks) @@ -131,7 +126,8 @@ def main(): os.path.join(model_args.output_dir, "mask_pred.jpg"), bbox_inches="tight", dpi=300, - pad_inches=0.0, ) + pad_inches=0.0, + ) if __name__ == "__main__": diff --git a/paddlemix/examples/blip2/export.py b/paddlemix/examples/blip2/export.py index 745b2f3f5ef7f..d206f4f47a749 100644 --- a/paddlemix/examples/blip2/export.py +++ b/paddlemix/examples/blip2/export.py @@ -11,24 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) from dataclasses import dataclass, field + import paddle -import requests +import yaml from paddlenlp.trainer import PdArgumentParser -from PIL import Image + from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor from paddlemix.utils.log import logger -import os -import yaml -import paddle -import argparse -import os -import paddle @dataclass @@ -41,13 +36,11 @@ class DataArguments: """ input_image: str = field( - default="http://images.cocodataset.org/val2017/000000039769.jpg", - metadata={"help": "The name of input image." - }) # "http://images.cocodataset.org/val2017/000000039769.jpg" + default="http://images.cocodataset.org/val2017/000000039769.jpg", metadata={"help": "The name of input image."} + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" prompt: str = field( - default=None, - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default=None, metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" @dataclass @@ -58,64 +51,62 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) pretrained_model_path: str = field( default=None, - metadata={ - "help": - "The path to pre-trained model that we will use for inference." - }, ) + metadata={"help": "The path to pre-trained model that we will use for inference."}, + ) fp16: str = field( default=True, - metadata={"help": "Export with mixed precision."}, ) + metadata={"help": "Export with mixed precision."}, + ) def main(): parser = PdArgumentParser((ModelArguments, DataArguments)) model_args, data_args = parser.parse_args_into_dataclasses() - url = (data_args.input_image - ) # "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - prompt = "a photo of " - processor = Blip2Processor.from_pretrained(model_args.model_name_or_path) - model = Blip2ForConditionalGeneration.from_pretrained( - model_args.model_name_or_path) + # url = data_args.input_image # "http://images.cocodataset.org/val2017/000000039769.jpg" + # image = Image.open(requests.get(url, stream=True).raw) + + # prompt = "a photo of " + # processor = Blip2Processor.from_pretrained(model_args.model_name_or_path) + model = Blip2ForConditionalGeneration.from_pretrained(model_args.model_name_or_path) model.eval() dtype = "float32" if model_args.fp16: decorated = paddle.amp.decorate( - models=[model.visual_encoder, model.language_model], - optimizers=None, - level="O2") + models=[model.visual_encoder, model.language_model], optimizers=None, level="O2" + ) model.visual_encoder, model.language_model = decorated dtype = "float16" shape1 = [None, 3, None, None] - input_spec = [paddle.static.InputSpec(shape=shape1, dtype='float32'), ] - image_encoder = paddle.jit.to_static( - model.encode_image, input_spec=input_spec) + input_spec = [ + paddle.static.InputSpec(shape=shape1, dtype="float32"), + ] + image_encoder = paddle.jit.to_static(model.encode_image, input_spec=input_spec) save_path = "blip2_export" - 
paddle.jit.save(image_encoder, os.path.join(save_path, 'image_encoder')) + paddle.jit.save(image_encoder, os.path.join(save_path, "image_encoder")) # TODO add test config deploy_info = { - 'Deploy': { - 'model': 'image_encoder.pdmodel', - 'params': 'image_encoder.pdiparams', - 'input_img_shape': shape1, - 'output_dtype': dtype + "Deploy": { + "model": "image_encoder.pdmodel", + "params": "image_encoder.pdiparams", + "input_img_shape": shape1, + "output_dtype": dtype, } } - msg = '\n---------------Deploy Information---------------\n' + msg = "\n---------------Deploy Information---------------\n" msg += str(yaml.dump(deploy_info)) logger.info(msg) - yml_file = os.path.join(save_path, 'deploy.yaml') - with open(yml_file, 'w') as file: + yml_file = os.path.join(save_path, "deploy.yaml") + with open(yml_file, "w") as file: yaml.dump(deploy_info, file) - logger.info(f'The inference model is saved in {save_path}') + logger.info(f"The inference model is saved in {save_path}") if __name__ == "__main__": diff --git a/paddlemix/examples/blip2/merge_weight.py b/paddlemix/examples/blip2/merge_weight.py new file mode 100644 index 0000000000000..ae7adabe81dff --- /dev/null +++ b/paddlemix/examples/blip2/merge_weight.py @@ -0,0 +1,114 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +from paddlemix.utils.log import logger + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["FLAGS_use_cuda_managed_memory"] = "true" + +import paddle +import torch +from paddlenlp.transformers import LlamaForCausalLM +from paddlenlp.transformers.opt.modeling import OPTForCausalLM + + +def merge(args): + model_dict = {} + # load the first item: vision_model + state_dict = paddle.load(args.blip2_path) + for n, p in state_dict.items(): + if n.startswith("vision_model") or n.startswith("qformer") or n == "query_tokens": + model_dict[n] = p + logger.info("[1/3] load ViT, qformer and query_tokens done!") + + # load the second item: llm model + if "opt" in args.llm_name: + llm_model = OPTForCausalLM.from_pretrained(args.llm_path) + elif "llama" in args.llm_name: + llm_model = LlamaForCausalLM.from_pretrained(args.llm_path) + else: + raise ValueError(f"The LLM model {args.llm_name} is not supported.") + + for n, p in llm_model.named_parameters(): + new_name = "language_model." + n + model_dict[new_name] = p + logger.info("[2/3] load language_model done!") + + # load the third item: blip2 + llm_state_dict = torch.load(args.llm_path) + for n, p in llm_state_dict["model"].items(): + if n.startswith(args.llm_name + "_model.model"): + new_name = n.replace(args.llm_name + "_model.model", "language_model."
+ args.llm_name) + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + if n.startswith(args.llm_name + "_proj"): + new_name = n.replace(args.llm_name + "_proj", "language_projection") + if n.endswith("weight"): + new_p = paddle.to_tensor(p.cpu().numpy()).transpose([1, 0]) + else: + new_p = paddle.to_tensor(p.cpu().numpy()) + model_dict[new_name] = new_p + + logger.info("[3/3] load language_projection, some llm weights from blip2 done!") + + save_path = os.path.join(args.save_path, "model_state.pdparams") + paddle.save(model_dict, save_path) + logger.info("The checkpoint of blip2 has been saved to: {}".format(save_path)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--blip2_path", + default="/blip2/dirname", + type=str, + help="The dir name of blip2-flan-t5-xxl.", + ) + parser.add_argument("--llm_name", default="opt", type=str, help="The name of llm model.") + parser.add_argument( + "--llm_path", + default="/llm/dirname", + type=str, + help="The dir name of llm model.", + ) + parser.add_argument( + "--save_path", + default="/save/to/dirname", + type=str, + help="The saving path of blip2.", + ) + args = parser.parse_args() + + args.blip2_path = os.path.join(args.blip2_path, "model_state.pdparams") + if not os.path.exists(args.blip2_path): + raise ValueError("Not found the file: {}".format(args.blip2_path)) + if not os.path.isdir(args.llm_path): + raise ValueError("It is not a directory: {}".format(args.llm_path)) + if not os.path.exists(args.llm_path): + raise ValueError("Not found the file: {}".format(args.llm_path)) + if not os.path.exists(args.save_path): + os.makedirs(args.save_path) + + merge(args) diff --git a/paddlemix/examples/blip2/run_eval_caption.py b/paddlemix/examples/blip2/run_eval_caption.py index 91443a8a4fc53..85ed6959f0e8d 100644 --- a/paddlemix/examples/blip2/run_eval_caption.py +++ b/paddlemix/examples/blip2/run_eval_caption.py @@ -12,31 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License.
-import sys import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -import paddle.distributed as dist -from paddle.distributed import fleet +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) +import random from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from sklearn.utils import compute_sample_weight -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) -from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint +from paddlenlp.transformers import AutoTokenizer + from paddlemix.datasets import load_dataset -from paddlemix.models.blip2.configuration import ( - Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.examples.blip2.utils import BlipCollator from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import BlipCollator @dataclass @@ -50,13 +50,11 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={ - "help": "The name of the task to use (via the datasets library)." - }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) prompt: str = field( - default="a photo of ", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="a photo of ", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" @dataclass @@ -67,11 +65,13 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) @dataclass @@ -79,99 +79,63 @@ class PreTrainingArguments(TrainingArguments): """ Arguments pertaining to what training options we are going to use during pretraining. 
""" - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=2000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=128, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=128, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=1, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) - warmup_start_lr: float = field( - default=1e-6, - metadata={"help": " The initial learning rate of blip2."}) + default=1, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) + warmup_start_lr: float = field(default=1e-6, metadata={"help": " The initial learning rate of blip2."}) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=True, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=True, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): # blip2_config = 
Blip2ForConditionalGeneration(onfig.model_name_or_path) - model = Blip2ForConditionalGeneration.from_pretrained( - pretrained_model_name_or_path=config.model_name_or_path) + model = Blip2ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=config.model_name_or_path) paddle.device.cuda.empty_cache() return model def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config @@ -186,14 +150,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -202,20 +164,21 @@ def main(): ) # create dataset - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) image_processor_eval = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) text_processor_class_eval = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) - eval_processor = Blip2Processor(image_processor_eval, - text_processor_class_eval, tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) + eval_processor = Blip2Processor(image_processor_eval, text_processor_class_eval, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} @@ -225,8 +188,7 @@ def main(): model_args.mp_degree = training_args.tensor_parallel_degree model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format( - training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) # create trainer trainer = Trainer( model=model, @@ -237,7 
+199,8 @@ def main(): eval_collator=blip_eval_collator, processor=processor, eval_processor=eval_processor, - tokenizer=tokenizer_class) + tokenizer=tokenizer_class, + ) eval_metrics = trainer.evaluate(eval_dataset) trainer.log_metrics("eval", eval_metrics) @@ -245,17 +208,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) # breakpoint() strategy.hybrid_configs = { "dp_degree": args.dp_degree, @@ -267,7 +228,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -287,8 +248,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ -296,12 +256,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/run_predict.py b/paddlemix/examples/blip2/run_predict.py index 29a24d402df11..50c870cedfdf7 100644 --- a/paddlemix/examples/blip2/run_predict.py +++ b/paddlemix/examples/blip2/run_predict.py @@ -11,30 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import os -import paddle.distributed as dist -from paddle.distributed import fleet +import random +import sys from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -from dataclasses import dataclass, field -import paddle + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) + import requests -from paddlenlp.trainer import PdArgumentParser +from paddlenlp.trainer import PdArgumentParser, TrainingArguments +from paddlenlp.transformers import AutoTokenizer from PIL import Image +from paddlemix.examples.blip2.utils import LLM_LIST, load_model from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import load_model, LLM_LIST -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments) @dataclass @@ -47,13 +49,11 @@ class DataArguments: """ input_image: str = field( - default="http://images.cocodataset.org/val2017/000000039769.jpg", - metadata={"help": "The name of input image." - }) # "http://images.cocodataset.org/val2017/000000039769.jpg" + default="http://images.cocodataset.org/val2017/000000039769.jpg", metadata={"help": "The name of input image."} + ) # "http://images.cocodataset.org/val2017/000000039769.jpg" prompt: str = field( - default="describe the image", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="describe the image", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? Answer:" @dataclass @@ -64,14 +64,14 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-caption-opt2.7b", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) - image_size: int = field( - default=224, - metadata={"help": " Image size for training. (default:224)"}) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + image_size: int = field(default=224, metadata={"help": " Image size for training. (default:224)"}) @dataclass @@ -79,84 +79,54 @@ class PreTrainingArguments(TrainingArguments): """ Arguments pertaining to what training options we are going to use during pretraining. 
""" - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=2000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=128, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=128, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=128, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) - warmup_start_lr: float = field( - default=1e-6, - metadata={"help": " The initial learning rate of blip2."}) + default=128, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) + warmup_start_lr: float = field(default=1e-6, metadata={"help": " The initial learning rate of blip2."}) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=False, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): - model = Blip2ForConditionalGeneration.from_pretrained( - pretrained_model_name_or_path=config.model_name_or_path) + model = Blip2ForConditionalGeneration.from_pretrained(pretrained_model_name_or_path=config.model_name_or_path) paddle.device.cuda.empty_cache() return model def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() - url = (data_args.input_image - ) # "http://images.cocodataset.org/val2017/000000039769.jpg" + url = data_args.input_image # "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -167,36 +137,29 @@ def main(): model_args.data_world_size = training_args.data_world_size paddle.set_device(training_args.device) prompt = data_args.prompt - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) 
image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) inputs = processor( images=image, text=prompt, return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) model = create_model(model_args) model.eval() if training_args.model_path is not None: checkpoint = training_args.model_path - load_model( - training_args, - model, - ckpt_dir=checkpoint, - load_language_model=False) - load_model( - training_args, - model.language_model, - ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) + load_model(training_args, model, ckpt_dir=checkpoint, load_language_model=False) + load_model(training_args, model.language_model, ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) generated_ids, scores = model.generate(**inputs) - generated_text = processor.batch_decode( - generated_ids, skip_special_tokens=True)[0].strip() + generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() logger.info("Generate text: {}".format(generated_text)) return model @@ -204,17 +167,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) if args.sharding_parallel_degree > 1: args.sharding = "stage1" strategy.hybrid_configs = { @@ -227,7 +188,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -244,8 +205,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ -253,12 +213,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) 
np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/run_pretrain_stage1.py b/paddlemix/examples/blip2/run_pretrain_stage1.py index 4b6a0847d70b3..bc0302ca7cbdf 100644 --- a/paddlemix/examples/blip2/run_pretrain_stage1.py +++ b/paddlemix/examples/blip2/run_pretrain_stage1.py @@ -12,31 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -import paddle.distributed as dist -from paddle.distributed import fleet +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) +import random from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) -from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint +from paddlenlp.transformers import AutoTokenizer + from paddlemix.datasets import load_dataset -from paddlemix.models.blip2.configuration import ( - Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.models.blip2.configuration import Blip2Config +from paddlemix.models.blip2.eva_vit import interpolate_pos_embed from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.models.blip2.eva_vit import interpolate_pos_embed -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import load_model class BlipCollator: @@ -58,15 +59,18 @@ def __call__(self, data_list): text = None else: text = [sample["text_input"] for sample in data_list] - image_id = [sample["image_id"] for sample in data_list] + # image_id = [sample["image_id"] for sample in data_list] batch = self.processor( images=images, return_tensors="pd", - mode=self.mode, ) + mode=self.mode, + ) # batch.update({'image_id':image_id},) - batch.update({'text_input_stage1': text}, ) + batch.update( + {"text_input_stage1": text}, + ) return batch @@ -81,13 +85,11 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={ - "help": "The name of the task to use (via the datasets library)." - }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) prompt: str = field( - default="a photo of ", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="a photo of ", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? 
Answer:" @dataclass @@ -98,14 +100,14 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-stage1", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) - image_size: int = field( - default=224, - metadata={"help": " Image size for training. (default:224)"}) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + image_size: int = field(default=224, metadata={"help": " Image size for training. (default:224)"}) @dataclass @@ -114,68 +116,41 @@ class PreTrainingArguments(TrainingArguments): Arguments pertaining to what training options we are going to use during pretraining. """ - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=5000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=5000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=256, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=256, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=128, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) + default=128, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=False, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) - checkpoint_steps: int = field( - default=1000, metadata={"help": "save checkpoint with x steps"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) + checkpoint_steps: int = field(default=1000, metadata={"help": "save checkpoint with x steps"}) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): @@ -188,8 +163,7 @@ def create_model(config): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config @@ -204,14 +178,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = 
get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -220,14 +192,14 @@ def main(): ) # create dataset - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} @@ -236,8 +208,7 @@ def main(): model_args.mp_degree = training_args.tensor_parallel_degree model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format( - training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) # create trainer trainer = Trainer( model=model, @@ -246,7 +217,8 @@ def main(): eval_dataset=eval_dataset, data_collator=blip_collator, processor=processor, - tokenizer=tokenizer_class) + tokenizer=tokenizer_class, + ) # Training checkpoint = None if training_args.resume_from_checkpoint is not None: @@ -263,17 +235,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) if args.sharding_parallel_degree > 1: args.sharding = "stage1" strategy.hybrid_configs = { @@ -286,7 +256,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -303,8 +273,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ 
-312,12 +281,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/run_pretrain_stage2.py b/paddlemix/examples/blip2/run_pretrain_stage2.py index bff19a553dc73..e34cc7e229d0e 100644 --- a/paddlemix/examples/blip2/run_pretrain_stage2.py +++ b/paddlemix/examples/blip2/run_pretrain_stage2.py @@ -12,31 +12,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import os -sys.path.insert( - 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '../../..')) -import paddle.distributed as dist -from paddle.distributed import fleet +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../..")) +import random from dataclasses import dataclass, field + import numpy as np -import random import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) -from paddlenlp.transformers import AutoConfig, OPTConfig, T5Config +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint +from paddlenlp.transformers import AutoTokenizer + from paddlemix.datasets import load_dataset -from paddlemix.models.blip2.configuration import ( - Blip2Config, Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.examples.blip2.utils import LLM_LIST, BlipCollator, load_model +from paddlemix.models.blip2.configuration import Blip2Config from paddlemix.models.blip2.modeling import Blip2ForConditionalGeneration -from paddlemix.processors.blip_processing import Blip2Processor +from paddlemix.processors.blip_processing import ( + Blip2Processor, + BlipImageProcessor, + BlipTextProcessor, +) from paddlemix.trainer.blip2_trainer import BLIP2Trainer as Trainer from paddlemix.utils.log import logger -from paddlenlp.transformers import AutoTokenizer -from paddlemix.models.blip2.eva_vit import interpolate_pos_embed -from paddlemix.processors.blip_processing import BlipImageProcessor, BlipTextProcessor -from paddlemix.examples.blip2.utils import BlipCollator, LLM_LIST, load_model @dataclass @@ -50,13 +51,11 @@ class DataArguments: task_name: str = field( default="coco_caption", - metadata={ - "help": "The name of the task to use (via the datasets library)." - }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) prompt: str = field( - default="a photo of ", - metadata={"help": "The prompt of the image to be generated." - }) # "Question: how many cats are there? Answer:" + default="a photo of ", metadata={"help": "The prompt of the image to be generated."} + ) # "Question: how many cats are there? 
Answer:" @dataclass @@ -67,17 +66,15 @@ class ModelArguments: model_name_or_path: str = field( default="paddlemix/blip2-stage2", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) text_model_name_or_path: str = field( default="facebook/opt-2.7b", - metadata={"help": "The type of text model to use (OPT, T5)."}, ) - image_size: int = field( - default=224, - metadata={"help": " Image size for training. (default:224)"}) - llm_name: str = field( - default="opt-2.7b", - metadata={"help": "llm name which you ned to load in LLM_LIST"}) + metadata={"help": "The type of text model to use (OPT, T5)."}, + ) + image_size: int = field(default=224, metadata={"help": " Image size for training. (default:224)"}) + llm_name: str = field(default="opt-2.7b", metadata={"help": "llm name which you ned to load in LLM_LIST"}) @dataclass @@ -85,75 +82,46 @@ class PreTrainingArguments(TrainingArguments): """ Arguments pertaining to what training options we are going to use during pretraining. """ - weight_decay: float = field( - default=0.05, metadata={"help": "Weight decay if we apply some."}) - learning_rate: float = field( - default=0.0001, metadata={"help": "The initial learning rate."}) - num_train_epochs: float = field( - default=10.0, - metadata={"help": "Total number of training epochs to perform."}) - warmup_start_lr: float = field( - default=1e-6, metadata={"help": "Initial learning rate of warm up."}) - eta_min: float = field( - default=1e-5, metadata={"help": "The minimum value of learning rate."}) - warmup_steps: int = field( - default=2000, metadata={"help": "Number of warmup steps."}) - lr_scheduler_name: str = field( - default="CosineDecayWithWarmup", - metadata={"help": "The scheduler name to use."}) + + weight_decay: float = field(default=0.05, metadata={"help": "Weight decay if we apply some."}) + learning_rate: float = field(default=0.0001, metadata={"help": "The initial learning rate."}) + num_train_epochs: float = field(default=10.0, metadata={"help": "Total number of training epochs to perform."}) + warmup_start_lr: float = field(default=1e-6, metadata={"help": "Initial learning rate of warm up."}) + eta_min: float = field(default=1e-5, metadata={"help": "The minimum value of learning rate."}) + warmup_steps: int = field(default=2000, metadata={"help": "Number of warmup steps."}) + lr_scheduler_name: str = field(default="CosineDecayWithWarmup", metadata={"help": "The scheduler name to use."}) per_device_train_batch_size: int = field( - default=32, - metadata={ - "help": "Batch size per GPU core/CPU for training. (default: 8)" - }) + default=32, metadata={"help": "Batch size per GPU core/CPU for training. (default: 8)"} + ) per_device_eval_batch_size: int = field( - default=128, - metadata={ - "help": " Batch size per GPU core/CPU for evaluation. (default:8)" - }) - warmup_start_lr: float = field( - default=1e-6, - metadata={"help": " The initial learning rate of blip2."}) + default=128, metadata={"help": " Batch size per GPU core/CPU for evaluation. 
(default:8)"} + ) + warmup_start_lr: float = field(default=1e-6, metadata={"help": " The initial learning rate of blip2."}) output_dir: str = field(default=".", metadata={"help": "The output path"}) - do_eval: bool = field( - default=False, metadata={"help": "Whether to evaluation."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to evaluation."}) do_train: bool = field(default=True, metadata={"help": "Whether to train."}) - logging_steps: int = field( - default=50, metadata={"help": "Logging interval"}) - evaluation_strategy: str = field( - default="no", - metadata={"help": "Evaluation strategy (epoch/steps/no)"}) + logging_steps: int = field(default=50, metadata={"help": "Logging interval"}) + evaluation_strategy: str = field(default="no", metadata={"help": "Evaluation strategy (epoch/steps/no)"}) - fp16_opt_level: str = field( - default="O1", metadata={"help": "Mixed Precision Type"}) - fp16: bool = field( - default=True, metadata={"help": "Whether to use mixed Precision"}) + fp16_opt_level: str = field(default="O1", metadata={"help": "Mixed Precision Type"}) + fp16: bool = field(default=True, metadata={"help": "Whether to use mixed Precision"}) gradient_checkpointing: bool = field( - default=False, - metadata={"help": "Forward recompute for saving graphics memory"}) - tensor_parallel_degree: int = field( - default=1, - metadata={"help": "Set the number of tensor model parallel"}) + default=False, metadata={"help": "Forward recompute for saving graphics memory"} + ) + tensor_parallel_degree: int = field(default=1, metadata={"help": "Set the number of tensor model parallel"}) sharding_parallel_degree: int = field( - default=1, - metadata={ - "help": "Set the number of sharding, enable sharding parallel" - }) - pipeline_parallel_degree: int = field( - default=1, metadata={"help": "Enable pipeline parallel"}) + default=1, metadata={"help": "Set the number of sharding, enable sharding parallel"} + ) + pipeline_parallel_degree: int = field(default=1, metadata={"help": "Enable pipeline parallel"}) resume_from_checkpoint: str = field( default=None, - metadata={ - "help": - "The path to a folder with a valid checkpoint for your model." 
- }, ) + metadata={"help": "The path to a folder with a valid checkpoint for your model."}, + ) model_path: str = field( default=None, - metadata={ - "help": - "The path to model if you want to load weights from the specified path" - }, ) + metadata={"help": "The path to model if you want to load weights from the specified path"}, + ) def create_model(config): @@ -166,8 +134,7 @@ def create_model(config): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config @@ -182,14 +149,12 @@ def main(): # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " - + - f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16 or training_args.bf16}" ) # Detecting last checkpoint last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( @@ -199,20 +164,21 @@ def main(): # create dataset - tokenizer_class = AutoTokenizer.from_pretrained( - model_args.text_model_name_or_path, use_fast=False) + tokenizer_class = AutoTokenizer.from_pretrained(model_args.text_model_name_or_path, use_fast=False) image_processor = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) text_processor_class = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "train")) - processor = Blip2Processor(image_processor, text_processor_class, - tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "train") + ) + processor = Blip2Processor(image_processor, text_processor_class, tokenizer_class) image_processor_eval = BlipImageProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) text_processor_class_eval = BlipTextProcessor.from_pretrained( - os.path.join(model_args.model_name_or_path, "processor", "eval")) - eval_processor = Blip2Processor(image_processor_eval, - text_processor_class_eval, tokenizer_class) + os.path.join(model_args.model_name_or_path, "processor", "eval") + ) + eval_processor = Blip2Processor(image_processor_eval, text_processor_class_eval, tokenizer_class) train_dataset = load_dataset(data_args.task_name, splits="train") eval_dataset = {"test": load_dataset(data_args.task_name, splits="test")} @@ -223,8 +189,7 @@ def main(): model_args.gradient_checkpointing = training_args.gradient_checkpointing model = create_model(model_args) - logger.info("training_args.use_hybrid_parallel:{}".format( - training_args.use_hybrid_parallel)) + logger.info("training_args.use_hybrid_parallel:{}".format(training_args.use_hybrid_parallel)) trainer = Trainer( model=model, 
args=training_args, @@ -234,32 +199,18 @@ def main(): eval_collator=blip_eval_collator, processor=processor, eval_processor=eval_processor, - tokenizer=tokenizer_class) + tokenizer=tokenizer_class, + ) # Training checkpoint = None if training_args.model_path is not None: checkpoint = training_args.model_path - load_model( - training_args, - model, - ckpt_dir=model_args.model_path, - load_language_model=False) - load_model( - training_args, - model.language_model, - ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) + load_model(training_args, model, ckpt_dir=model_args.model_path, load_language_model=False) + load_model(training_args, model.language_model, ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) if training_args.resume_from_checkpoint is not None: - checkpoint = os.path.join(training_args.resume_from_checkpoint, - "model_state.pdparams") - load_model( - training_args, - model, - ckpt_dir=checkpoint, - load_language_model=False) - load_model( - training_args, - model.language_model, - ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) + checkpoint = os.path.join(training_args.resume_from_checkpoint, "model_state.pdparams") + load_model(training_args, model, ckpt_dir=checkpoint, load_language_model=False) + load_model(training_args, model.language_model, ckpt_dir=LLM_LIST[model_args.text_model_name_or_path]) if training_args.do_eval: eval_metrics = trainer.evaluate(eval_dataset) trainer.log_metrics("eval", eval_metrics) @@ -272,17 +223,15 @@ def main(): def setdistenv(args): if args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree != 1: args.use_hybrid_parallel = True - args.dp_degree = dist.get_world_size() \ - // (args.tensor_parallel_degree \ - * args.sharding_parallel_degree * \ - args.pipeline_parallel_degree) + args.dp_degree = dist.get_world_size() // ( + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() if args.tensor_parallel_degree > 1: strategy.tensor_parallel = True args.data_parallel_degree = args.dp_degree logger.info("args.dp_degree:{}".format(args.dp_degree)) - logger.info("args.sharding_parallel_degree):{}".format( - args.sharding_parallel_degree)) + logger.info("args.sharding_parallel_degree):{}".format(args.sharding_parallel_degree)) if args.sharding_parallel_degree > 1: args.sharding = "stage1" strategy.hybrid_configs = { @@ -295,7 +244,7 @@ def setdistenv(args): MICRO_BATCH_SIZE = 32 strategy.pipeline_configs = { "accumulate_steps": BATCH_SIZE // MICRO_BATCH_SIZE, - "micro_batch_size": MICRO_BATCH_SIZE + "micro_batch_size": MICRO_BATCH_SIZE, } strategy.find_unused_parameters = True @@ -312,8 +261,7 @@ def setdistenv(args): args.sharding_rank = hcg.get_sharding_parallel_rank() args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank - args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.data_world_size = dist.get_world_size() // abs(args.tensor_parallel_degree * args.pipeline_parallel_degree) # seed control in hybrid parallel set_hyrbid_parallel_seed(args.seed, args.data_world_rank, args.mp_rank) @@ -321,12 +269,12 @@ def setdistenv(args): def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): device_id = paddle.device.get_device() - assert 'gpu' in device_id + assert "gpu" in device_id random.seed(basic_seed + data_world_rank) np.random.seed(basic_seed + data_world_rank) paddle.seed(basic_seed + 
data_world_rank) - #TODO add manual_seed + # TODO add manual_seed # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = 1024 + basic_seed + mp_rank * 100 + data_world_rank global_seed = 2048 + basic_seed + data_world_rank diff --git a/paddlemix/examples/blip2/utils.py b/paddlemix/examples/blip2/utils.py index 7a0077de656b3..c93e0dec959ef 100644 --- a/paddlemix/examples/blip2/utils.py +++ b/paddlemix/examples/blip2/utils.py @@ -11,47 +11,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy +import datetime +import json import os +import re +import sys +import time + +import paddle from pycocoevalcap.eval import COCOEvalCap from pycocotools.coco import COCO -from paddlemix.utils.downloader import get_weights_path_from_url -from paddlemix.utils.downloader import is_url + from paddlemix.models.blip2.eva_vit import interpolate_pos_embed -import paddle +from paddlemix.utils.downloader import get_weights_path_from_url, is_url from paddlemix.utils.log import logger -import time -import json -import sys -import re -import json -import datetime -import copy LLM_LIST = { - "facebook/opt-2.7b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/opt-2.7b/model_state.pdparams", - "t5-small": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-small/model_state.pdparams", - "t5-base": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-base/model_state.pdparams", - "t5-large": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-large/model_state.pdparams", - "t5-3b": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-3b/model_state.pdparams", - "t5-11b": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-11b/model_state.pdparams", - "t5-v1_1-base": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-base/model_state.pdparams", - "t5-v1_1-large": - "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-large/model_state.pdparams", - "facebook/llama-7b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-7b/model_state.pdparams", - "facebook/llama-13b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-13b/model_state.pdparams", - "facebook/llama-30b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-30b/model_state.pdparams", - "facebook/llama-65b": - "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-65b/model_state.pdparams", + "facebook/opt-2.7b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/opt-2.7b/model_state.pdparams", + "t5-small": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-small/model_state.pdparams", + "t5-base": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-base/model_state.pdparams", + "t5-large": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-large/model_state.pdparams", + "t5-3b": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-3b/model_state.pdparams", + "t5-11b": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-11b/model_state.pdparams", + "t5-v1_1-base": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-base/model_state.pdparams", + "t5-v1_1-large": "https://bj.bcebos.com/paddlenlp/models/transformers/t5/t5-v1_1-large/model_state.pdparams", + "facebook/llama-7b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-7b/model_state.pdparams", + "facebook/llama-13b": 
"https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-13b/model_state.pdparams", + "facebook/llama-30b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-30b/model_state.pdparams", + "facebook/llama-65b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/llama-65b/model_state.pdparams", } @@ -81,25 +69,24 @@ def __call__(self, data_list): max_length=32, return_tensors="pd", return_attention_mask=True, - mode=self.mode, ) - batch.update({'image_id': image_id}) + mode=self.mode, + ) + batch.update({"image_id": image_id}) return batch def coco_caption_eval(coco_gt_root, results_file, split): - urls = { - "val": - "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json", - "test": - "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json", - } + # urls = { + # "val": "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val_gt.json", + # "test": "https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test_gt.json", + # } filenames = { "val": "coco_karpathy_val_gt.json", "test": "coco_karpathy_test_gt.json", } - #download_url(urls[split], coco_gt_root) - annotation_file = os.path.join(coco_gt_root, filenames['test']) + # download_url(urls[split], coco_gt_root) + annotation_file = os.path.join(coco_gt_root, filenames["test"]) # create coco object and coco_result object coco = COCO(annotation_file) @@ -115,11 +102,7 @@ def coco_caption_eval(coco_gt_root, results_file, split): return coco_eval -def load_model(args, - model, - optimizer=None, - ckpt_dir="", - load_language_model=True): +def load_model(args, model, optimizer=None, ckpt_dir="", load_language_model=True): """ load the saved checkpoint file and update the state dicts of model and optimizer. 
""" @@ -140,18 +123,27 @@ def load_model(args, if ckpt_dir and os.path.isfile(ckpt_dir): # breakpoint() print("Try to load a whole checkpoint from %s " % ckpt_dir) - embedding_list = ['word_embeddings'] + embedding_list = ["word_embeddings"] collinear_list = [ - "fc1", "fc2", "qkv", "proj", "query", "key", "value", "qkv_proj", - "q_proj", "k_proj", "v_proj", "linear1", "linear2", "project_in", - "project_out" + "fc1", + "fc2", + "qkv", + "proj", + "query", + "key", + "value", + "qkv_proj", + "q_proj", + "k_proj", + "v_proj", + "linear1", + "linear2", + "project_in", + "project_out", ] rowlinear_list = ["out_proj"] all_list = collinear_list + rowlinear_list + embedding_list - skip_list = [ - 'visual_encoder.patch_embed.proj.weight', - 'visual_encoder.patch_embed.proj.bias' - ] + skip_list = ["visual_encoder.patch_embed.proj.weight", "visual_encoder.patch_embed.proj.bias"] col_list = [] row_list = [] @@ -161,10 +153,10 @@ def load_model(args, mp_size = args.tensor_parallel_degree def renamebias(model_dict, whole_key): - if 'q_bias' in whole_key: - key = whole_key.replace('q_bias', 'q_proj.bias') - elif 'v_bias' in whole_key: - key = whole_key.replace('v_bias', 'v_proj.bias') + if "q_bias" in whole_key: + key = whole_key.replace("q_bias", "q_proj.bias") + elif "v_bias" in whole_key: + key = whole_key.replace("v_bias", "v_proj.bias") model_dict[key] = model_dict[whole_key] del model_dict[whole_key] return model_dict @@ -172,47 +164,44 @@ def renamebias(model_dict, whole_key): def col_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[1] // mp_size - return model_dict[:, mp_rank * subbatch:(mp_rank + 1) * - subbatch] + return model_dict[:, mp_rank * subbatch : (mp_rank + 1) * subbatch] elif len(model_dict.shape) == 1: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] def row_split_modeldict(model_dict): if len(model_dict.shape) == 2: subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] else: return model_dict def emb_split_modeldict(model_dict): subbatch = model_dict.shape[0] // mp_size - return model_dict[mp_rank * subbatch:(mp_rank + 1) * subbatch] + return model_dict[mp_rank * subbatch : (mp_rank + 1) * subbatch] model_dict = paddle.load(ckpt_dir) for whole_key in model_dict.keys(): - if not '.' in whole_key: + if "." 
not in whole_key: continue - key = whole_key.split('.')[-2] + key = whole_key.split(".")[-2] if whole_key in skip_list: continue if key in all_list: if key in collinear_list: col_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = col_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = col_split_modeldict(model_dict[whole_key]) elif key in rowlinear_list: row_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = row_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = row_split_modeldict(model_dict[whole_key]) else: emb_list.append((key, model_dict[whole_key].shape)) - model_dict[whole_key] = emb_split_modeldict(model_dict[ - whole_key]) + model_dict[whole_key] = emb_split_modeldict(model_dict[whole_key]) param_state_dict = model_dict import numpy as np + model_dict = model.state_dict() model_weight = {} incorrect_keys = 0 @@ -220,21 +209,19 @@ def emb_split_modeldict(model_dict): if key in param_state_dict.keys(): if isinstance(param_state_dict[key], np.ndarray): - param_state_dict[key] = paddle.to_tensor(param_state_dict[ - key]) + param_state_dict[key] = paddle.to_tensor(param_state_dict[key]) if value.dtype == param_state_dict[key].dtype: model_weight[key] = param_state_dict[key] else: - model_weight[key] = param_state_dict[key].astype( - value.dtype) + model_weight[key] = param_state_dict[key].astype(value.dtype) if value.shape != param_state_dict[key].shape: - logger.info('Unmatched key: {}'.format(key)) + logger.info("Unmatched key: {}".format(key)) print(value.shape, param_state_dict[key].shape, key) else: - if load_language_model == False and "language_model" in key: + if load_language_model is False and "language_model" in key: continue - logger.info('Unmatched key: {}'.format(key)) + logger.info("Unmatched key: {}".format(key)) incorrect_keys += 1 interpolate_pos_embed(model, model_weight) model.set_state_dict(model_weight) @@ -245,13 +232,13 @@ def emb_split_modeldict(model_dict): raise TypeError("`load` requires a valid value of `ckpt_dir`.") -def save_result(result, result_dir, filename, remove_duplicate="", - world_size=1): +def save_result(result, result_dir, filename, remove_duplicate="", world_size=1): import logging + rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) - result_file = os.path.join(result_dir, - "%s_rank%d.json" % (filename, rank_id_curr_node)) - if not os.path.exists(result_dir): os.mkdir(result_dir) + result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank_id_curr_node)) + if not os.path.exists(result_dir): + os.mkdir(result_dir) json.dump(result, open(result_file, "w")) final_result_file = os.path.join(result_dir, "%s.json" % filename) @@ -262,8 +249,7 @@ def save_result(result, result_dir, filename, remove_duplicate="", result = [] # for rank in range(get_world_size()): for rank in range(int(os.environ.get("PADDLE_TRAINERS_NUM", 1))): - result_file = os.path.join(result_dir, - "%s_rank%d.json" % (filename, rank)) + result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank)) res = json.load(open(result_file, "r")) result += res @@ -281,8 +267,7 @@ def save_result(result, result_dir, filename, remove_duplicate="", else: while not os.path.exists(final_result_file): time.sleep(0.5) - logging.warning("rank %d waits rank0 to merge results." % - rank_id_curr_node) + logging.warning("rank %d waits rank0 to merge results." 
% rank_id_curr_node) # combine results from all processes return final_result_file @@ -464,7 +449,7 @@ def __init__(self, vqa=None, vqaRes=None, n=2): ] def evaluate(self, quesIds=None): - if quesIds == None: + if quesIds is None: quesIds = [quesId for quesId in self.params["question_id"]] gts = {} res = {} @@ -493,13 +478,8 @@ def evaluate(self, quesIds=None): for ansDic in gts[quesId]["answers"]: ansDic["answer"] = self.processPunctuation(ansDic["answer"]) for gtAnsDatum in gts[quesId]["answers"]: - otherGTAns = [ - item for item in gts[quesId]["answers"] - if item != gtAnsDatum - ] - matchingAns = [ - item for item in otherGTAns if item["answer"] == resAns - ] + otherGTAns = [item for item in gts[quesId]["answers"] if item != gtAnsDatum] + matchingAns = [item for item in otherGTAns if item["answer"] == resAns] acc = min(1, float(len(matchingAns)) / 3) gtAcc.append(acc) quesType = gts[quesId]["question_type"] @@ -525,8 +505,7 @@ def evaluate(self, quesIds=None): def processPunctuation(self, inText): outText = inText for p in self.punct: - if (p + " " in inText or " " + p in inText) or ( - re.search(self.commaStrip, inText) != None): + if (p + " " in inText or " " + p in inText) or (re.search(self.commaStrip, inText) is not None): outText = outText.replace(p, "") else: outText = outText.replace(p, " ") @@ -549,18 +528,16 @@ def processDigitArticle(self, inText): return outText def setAccuracy(self, accQA, accQuesType, accAnsType): - self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), - self.n) + self.accuracy["overall"] = round(100 * float(sum(accQA)) / len(accQA), self.n) self.accuracy["perQuestionType"] = { quesType: round( - 100 * float(sum(accQuesType[quesType])) / - len(accQuesType[quesType]), - self.n, ) + 100 * float(sum(accQuesType[quesType])) / len(accQuesType[quesType]), + self.n, + ) for quesType in accQuesType } self.accuracy["perAnswerType"] = { - ansType: round(100 * float(sum(accAnsType[ansType])) / - len(accAnsType[ansType]), self.n) + ansType: round(100 * float(sum(accAnsType[ansType])) / len(accAnsType[ansType]), self.n) for ansType in accAnsType } @@ -593,8 +570,8 @@ def updateProgress(self, progress): status = "Done...\r\n" block = int(round(barLength * progress)) text = "\rFinshed Percent: [{0}] {1}% {2}".format( - "#" * block + "-" * (barLength - block), - int(progress * 100), status) + "#" * block + "-" * (barLength - block), int(progress * 100), status + ) sys.stdout.write(text) sys.stdout.flush() @@ -612,9 +589,9 @@ def __init__(self, annotation_file=None, question_file=None): self.qa = {} self.qqa = {} self.imgToQA = {} - if not annotation_file == None and not question_file == None: + if annotation_file is not None and question_file is not None: print("loading VQA annotations and questions into memory...") - time_t = datetime.datetime.utcnow() + # time_t = datetime.datetime.utcnow() dataset = json.load(open(annotation_file, "r")) questions = json.load(open(question_file, "r")) self.dataset = dataset @@ -664,17 +641,13 @@ def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]): else: if not len(imgIds) == 0: anns = sum( - [ - self.imgToQA[imgId] for imgId in imgIds - if imgId in self.imgToQA - ], - [], ) + [self.imgToQA[imgId] for imgId in imgIds if imgId in self.imgToQA], + [], + ) else: anns = self.dataset["annotations"] - anns = (anns if len(quesTypes) == 0 else - [ann for ann in anns if ann["question_type"] in quesTypes]) - anns = (anns if len(ansTypes) == 0 else - [ann for ann in anns if ann["answer_type"] in ansTypes]) + anns = anns if 
len(quesTypes) == 0 else [ann for ann in anns if ann["question_type"] in quesTypes] + anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann["answer_type"] in ansTypes] ids = [ann["question_id"] for ann in anns] return ids @@ -694,15 +667,11 @@ def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]): anns = self.dataset["annotations"] else: if not len(quesIds) == 0: - anns = sum([ - self.qa[quesId] for quesId in quesIds if quesId in self.qa - ], []) + anns = sum([self.qa[quesId] for quesId in quesIds if quesId in self.qa], []) else: anns = self.dataset["annotations"] - anns = (anns if len(quesTypes) == 0 else - [ann for ann in anns if ann["question_type"] in quesTypes]) - anns = (anns if len(ansTypes) == 0 else - [ann for ann in anns if ann["answer_type"] in ansTypes]) + anns = anns if len(quesTypes) == 0 else [ann for ann in anns if ann["question_type"] in quesTypes] + anns = anns if len(ansTypes) == 0 else [ann for ann in anns if ann["answer_type"] in ansTypes] ids = [ann["image_id"] for ann in anns] return ids @@ -742,8 +711,7 @@ def loadRes(self, resFile, quesFile): res.dataset["info"] = copy.deepcopy(self.questions["info"]) res.dataset["task_type"] = copy.deepcopy(self.questions["task_type"]) res.dataset["data_type"] = copy.deepcopy(self.questions["data_type"]) - res.dataset["data_subtype"] = copy.deepcopy(self.questions[ - "data_subtype"]) + res.dataset["data_subtype"] = copy.deepcopy(self.questions["data_subtype"]) res.dataset["license"] = copy.deepcopy(self.questions["license"]) print("Loading and preparing results... ") @@ -751,20 +719,21 @@ def loadRes(self, resFile, quesFile): anns = json.load(open(resFile)) assert type(anns) == list, "results is not an array of objects" annsQuesIds = [ann["question_id"] for ann in anns] - assert set(annsQuesIds) == set(self.getQuesIds( - )), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file." + assert set(annsQuesIds) == set( + self.getQuesIds() + ), "Results do not correspond to the current VQA set. Either the results do not have predictions for all question ids in the annotation file, or there is at least one question id that does not belong to the question ids in the annotation file." 
for ann in anns: quesId = ann["question_id"] if res.dataset["task_type"] == "Multiple Choice": - assert (ann["answer"] in self.qqa[quesId]["multiple_choices"] - ), "predicted answer is not one of the multiple choices" + assert ( + ann["answer"] in self.qqa[quesId]["multiple_choices"] + ), "predicted answer is not one of the multiple choices" qaAnn = self.qa[quesId] ann["image_id"] = qaAnn["image_id"] ann["question_type"] = qaAnn["question_type"] ann["answer_type"] = qaAnn["answer_type"] - print("DONE (t=%0.2fs)" % ( - (datetime.datetime.utcnow() - time_t).total_seconds())) + print("DONE (t=%0.2fs)" % ((datetime.datetime.utcnow() - time_t).total_seconds())) res.dataset["annotations"] = anns res.createIndex() - return res \ No newline at end of file + return res diff --git a/paddlemix/examples/evaclip/run_pretrain_dist.py b/paddlemix/examples/evaclip/run_pretrain_dist.py index 82573d19ebb21..ad7967bdead5a 100644 --- a/paddlemix/examples/evaclip/run_pretrain_dist.py +++ b/paddlemix/examples/evaclip/run_pretrain_dist.py @@ -24,17 +24,18 @@ from dataclasses import dataclass, field import paddle -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments from paddlenlp.transformers import AutoTokenizer -from paddlemix.checkpoint import load_model, save +from paddlemix.checkpoint import load_model from paddlemix.datasets import load_dataset -from paddlemix.models.evaclip.eva_clip_model import EVACLIP +from paddlemix.models.evaclip.eva_clip_model import EVACLIP, EVACLIPConfig from paddlemix.optimization import create_optimizer -from paddlemix.processors import SimpleTokenizer from paddlemix.processors.clip_processing import ( - CLIPImageProcessor, CLIPProcessor, CLIPTextProcessor) + CLIPImageProcessor, + CLIPProcessor, + CLIPTextProcessor, +) from paddlemix.trainer import CLIPTrainer from paddlemix.utils.env import setdistenv @@ -50,21 +51,23 @@ class DataArguments: task_name: str = field( default="coco_clip", - metadata={ - "help": "The name of the task to use (via the datasets library)." 
- }, ) + metadata={"help": "The name of the task to use (via the datasets library)."}, + ) image_size: int = field( default=224, - metadata={"help": "image size for training"}, ) + metadata={"help": "image size for training"}, + ) train_data: str = field( default="", - metadata={"help": "The traindata list path."}, ) + metadata={"help": "The traindata list path."}, + ) precomputed_text_emb: str = field( default="open_clip_vit_g_14", - metadata={"help": "precomputed_text_emb name."}, ) + metadata={"help": "precomputed_text_emb name."}, + ) @dataclass @@ -75,19 +78,20 @@ class ModelArguments: model: str = field( default="paddlemix/EVA/EVA02-CLIP-L-14", - metadata={ - "help": - "model name to create, for example [EVA02-CLIP-B-16/coca_EVA02-B-16]" - }, ) + metadata={"help": "model name to create, for example [EVA02-CLIP-B-16/coca_EVA02-B-16]"}, + ) model_name_or_path: str = field( default="clip", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) coca_caption_loss_weight: float = field( default=2.0, - metadata={"help": "coca_caption_loss_weight set, default: 2.0"}, ) + metadata={"help": "coca_caption_loss_weight set, default: 2.0"}, + ) coca_contrastive_loss_weight: float = field( default=1.0, - metadata={"help": "coca_contrastive_loss_weight set, default: 1.0"}, ) + metadata={"help": "coca_contrastive_loss_weight set, default: 1.0"}, + ) @dataclass @@ -98,48 +102,41 @@ class PreTrainingArguments(TrainingArguments): pretrained_model_path: str = field( default=None, - metadata={ - "help": - "The path to pre-trained model that we will use for pretraining." - }, ) - text_wd: float = field( - default=0.05, metadata={"help": "Weight decay for text tower"}) - visual_wd: float = field( - default=0.05, metadata={"help": "Weight decay for visual tower"}) - text_lr: float = field( - default=2e-5, - metadata={"help": "The initial learning rate of text tower."}) - visual_lr: float = field( - default=2e-4, - metadata={"help": "The initial learning rate of visual tower."}) - layer_decay: float = field( - default=1.0, metadata={"help": "The basic layer decay."}) - text_ld: float = field( - default=0.75, metadata={"help": "The layer decay of text tower."}) - visual_ld: float = field( - default=0.75, metadata={"help": "The layer decay of visual tower."}) + metadata={"help": "The path to pre-trained model that we will use for pretraining."}, + ) + text_wd: float = field(default=0.05, metadata={"help": "Weight decay for text tower"}) + visual_wd: float = field(default=0.05, metadata={"help": "Weight decay for visual tower"}) + text_lr: float = field(default=2e-5, metadata={"help": "The initial learning rate of text tower."}) + visual_lr: float = field(default=2e-4, metadata={"help": "The initial learning rate of visual tower."}) + layer_decay: float = field(default=1.0, metadata={"help": "The basic layer decay."}) + text_ld: float = field(default=0.75, metadata={"help": "The layer decay of text tower."}) + visual_ld: float = field(default=0.75, metadata={"help": "The layer decay of visual tower."}) start_epoch: int = field( default=0, - metadata={"help": " manual epoch number (useful on restarts)"}, ) + metadata={"help": " manual epoch number (useful on restarts)"}, + ) context_length: int = field( default=77, - metadata={"help": " context length for text."}, ) - optimizer: str = field( - default="lamb", metadata={"help": "optimizer setting, [lamb/adamw]"}) + metadata={"help": " context length for text."}, + ) + 
optimizer: str = field(default="lamb", metadata={"help": "optimizer setting, [lamb/adamw]"}) dp_degree: int = field( default=2, - metadata={"help": " data parallel degrees."}, ) - last_epoch: int = field( - default=-1, metadata={"help": "the last epoch to resume"}) + metadata={"help": " data parallel degrees."}, + ) + last_epoch: int = field(default=-1, metadata={"help": "the last epoch to resume"}) gather_with_grad: bool = field( default=False, - metadata={"help": "Whether to use gather_with_grad in loss."}, ) + metadata={"help": "Whether to use gather_with_grad in loss."}, + ) local_loss: bool = field( default=False, - metadata={"help": "Whether to use local loss in loss."}, ) + metadata={"help": "Whether to use local loss in loss."}, + ) tensorboard: bool = field( default=False, - metadata={"help": "Whether to use tensorboard to record loss."}, ) + metadata={"help": "Whether to use tensorboard to record loss."}, + ) class SelfTrainer(CLIPTrainer): @@ -154,16 +151,17 @@ def create_optimizer_and_scheduler(self, num_training_steps: int): self.lr_scheduler = paddle.optimizer.lr.CosineAnnealingDecay( 1.0, num_training_steps - self.args.warmup_steps, - last_epoch=self.args.last_epoch, ) + last_epoch=self.args.last_epoch, + ) if self.args.warmup_steps > 0: self.lr_scheduler = paddle.optimizer.lr.LinearWarmup( self.lr_scheduler, self.args.warmup_steps, 0, 1.0, - last_epoch=self.args.last_epoch, ) - self.optimizer = create_optimizer(self.args, self.model, - self.lr_scheduler) + last_epoch=self.args.last_epoch, + ) + self.optimizer = create_optimizer(self.args, self.model, self.lr_scheduler) class Collator: @@ -187,7 +185,8 @@ def __call__(self, data_list): max_length=77, return_tensors="pd", return_attention_mask=False, - mode="train", ) + mode="train", + ) return batch @@ -202,22 +201,22 @@ def main_worker(training_args, model_args, data_args): local_loss=training_args.local_loss, gather_with_grad=training_args.gather_with_grad, data_world_rank=training_args.data_world_rank, - data_world_size=training_args.data_world_size, ) + data_world_size=training_args.data_world_size, + ) training_args.model = model_args.model - if (training_args.pretrained_model_path and - training_args.pretrained_model_path != "None" and - training_args.resume_from_checkpoint is None): - load_model( - training_args, model, ckpt_dir=training_args.pretrained_model_path) + if ( + training_args.pretrained_model_path + and training_args.pretrained_model_path != "None" + and training_args.resume_from_checkpoint is None + ): + load_model(training_args, model, ckpt_dir=training_args.pretrained_model_path) if training_args.bf16 and training_args.fp16_opt_level == "O2": paddle.set_default_dtype("float32") train_dataset = load_dataset("coco_clip", splits="train") - image_processor = CLIPImageProcessor.from_pretrained( - model_args.model_name_or_path) - text_processor = CLIPTextProcessor.from_pretrained( - model_args.model_name_or_path) + image_processor = CLIPImageProcessor.from_pretrained(model_args.model_name_or_path) + text_processor = CLIPTextProcessor.from_pretrained(model_args.model_name_or_path) tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) processor = CLIPProcessor(image_processor, text_processor, tokenizer) collator = Collator(processor) @@ -226,7 +225,8 @@ def main_worker(training_args, model_args, data_args): model=model, args=training_args, train_dataset=train_dataset, - data_collator=collator, ) + data_collator=collator, + ) # Training checkpoint = None @@ -239,11 +239,8 @@ def 
main_worker(training_args, model_args, data_args): trainer.save_state() -from paddlemix.models.evaclip.eva_clip_model import EVACLIP, EVACLIPConfig - if __name__ == "__main__": - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.hostname = socket.gethostname() pprint.pprint(data_args) diff --git a/paddlemix/examples/evaclip/run_zero_shot_eval.py b/paddlemix/examples/evaclip/run_zero_shot_eval.py index 984495cd049fc..236b54795a4ff 100644 --- a/paddlemix/examples/evaclip/run_zero_shot_eval.py +++ b/paddlemix/examples/evaclip/run_zero_shot_eval.py @@ -23,10 +23,9 @@ import socket from dataclasses import dataclass, field -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments -from paddlemix.checkpoint import load_model, save +from paddlemix.checkpoint import load_model from paddlemix.datasets.laion_clip import get_data from paddlemix.metrics.clip_zero_shot import zero_shot_eval from paddlemix.models.evaclip.eva_clip_model import EVACLIP @@ -45,11 +44,13 @@ class DataArguments: classification_eval: str = field( default="", - metadata={"help": "Path to IN1K data."}, ) + metadata={"help": "Path to IN1K data."}, + ) precomputed_text_emb: str = field( default="open_clip_vit_g_14", - metadata={"help": "precomputed_text_emb name."}, ) + metadata={"help": "precomputed_text_emb name."}, + ) @dataclass @@ -60,13 +61,12 @@ class ModelArguments: model: str = field( default="paddlemix/EVA/EVA02-CLIP-L-14", - metadata={ - "help": - "model name to create, for example paddlemix/EVA/EVA02-CLIP-L-14" - }, ) + metadata={"help": "model name to create, for example paddlemix/EVA/EVA02-CLIP-L-14"}, + ) model_name_or_path: str = field( default="clip", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) @dataclass @@ -77,13 +77,9 @@ class PreTrainingArguments(TrainingArguments): pretrained_model_path: str = field( default=None, - metadata={ - "help": - "The path to pre-trained model that we will use for pretraining." 
- }, ) - pretrained_text_model: str = field( - default="openclip", - metadata={"help": "the model to pre-extract text feats"}) + metadata={"help": "The path to pre-trained model that we will use for pretraining."}, + ) + pretrained_text_model: str = field(default="openclip", metadata={"help": "the model to pre-extract text feats"}) def evaluate(model, dataloader_dict, args): @@ -94,15 +90,15 @@ def evaluate(model, dataloader_dict, args): def main_worker(training_args, model_args, data_args): - model = EVACLIP.from_pretrained( - model_args.model, ignore_mismatched_sizes=False) + model = EVACLIP.from_pretrained(model_args.model, ignore_mismatched_sizes=False) training_args.model = model_args.model - if (training_args.pretrained_model_path and - training_args.pretrained_model_path != "None" and - training_args.resume_from_checkpoint is None): - load_model( - training_args, model, ckpt_dir=training_args.pretrained_model_path) + if ( + training_args.pretrained_model_path + and training_args.pretrained_model_path != "None" + and training_args.resume_from_checkpoint is None + ): + load_model(training_args, model, ckpt_dir=training_args.pretrained_model_path) preprocess_train = image_transform(model.visual.image_size, is_train=True) preprocess_val = image_transform(model.visual.image_size, is_train=False) @@ -112,8 +108,7 @@ def main_worker(training_args, model_args, data_args): if __name__ == "__main__": - parser = PdArgumentParser( - (ModelArguments, DataArguments, PreTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, PreTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.hostname = socket.gethostname() pprint.pprint(data_args) diff --git a/paddlemix/examples/groundingdino/run_predict.py b/paddlemix/examples/groundingdino/run_predict.py index bf8022aabe463..dbd7f959cfc7c 100644 --- a/paddlemix/examples/groundingdino/run_predict.py +++ b/paddlemix/examples/groundingdino/run_predict.py @@ -78,9 +78,7 @@ class DataArguments: """ input_image: str = field(metadata={"help": "The name of input image."}) - prompt: str = field( - default=None, - metadata={"help": "The prompt of the image to be generated."}) + prompt: str = field(default=None, metadata={"help": "The prompt of the image to be generated."}) @dataclass @@ -91,19 +89,24 @@ class ModelArguments: model_name_or_path: str = field( default="GroundingDino/groundingdino-swint-ogc", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) box_threshold: float = field( default=0.3, - metadata={"help": "box threshold."}, ) + metadata={"help": "box threshold."}, + ) text_threshold: float = field( default=0.25, - metadata={"help": "text threshold."}, ) + metadata={"help": "text threshold."}, + ) output_dir: str = field( default="output", - metadata={"help": "output directory."}, ) + metadata={"help": "output directory."}, + ) visual: bool = field( default=True, - metadata={"help": "save visual image."}, ) + metadata={"help": "save visual image."}, + ) def main(): @@ -111,12 +114,10 @@ def main(): model_args, data_args = parser.parse_args_into_dataclasses() # bulid processor - processor = GroudingDinoProcessor.from_pretrained( - model_args.model_name_or_path) + processor = GroudingDinoProcessor.from_pretrained(model_args.model_name_or_path) # bulid model logger.info("dino_model: {}".format(model_args.model_name_or_path)) - dino_model = GroundingDinoModel.from_pretrained( - 
model_args.model_name_or_path) + dino_model = GroundingDinoModel.from_pretrained(model_args.model_name_or_path) dino_model.eval() # read image url = data_args.input_image @@ -125,11 +126,9 @@ def main(): # read image image_pil = Image.open(data_args.input_image).convert("RGB") else: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") # preprocess image text_prompt - image_tensor, mask, tokenized_out = processor( - images=image_pil, text=data_args.prompt) + image_tensor, mask, tokenized_out = processor(images=image_pil, text=data_args.prompt) with paddle.no_grad(): outputs = dino_model( @@ -137,9 +136,9 @@ def main(): mask, input_ids=tokenized_out["input_ids"], attention_mask=tokenized_out["attention_mask"], - text_self_attention_masks=tokenized_out[ - "text_self_attention_masks"], - position_ids=tokenized_out["position_ids"], ) + text_self_attention_masks=tokenized_out["text_self_attention_masks"], + position_ids=tokenized_out["position_ids"], + ) logits = F.sigmoid(outputs["pred_logits"])[0] # (nq, 256) boxes = outputs["pred_boxes"][0] # (nq, 4) diff --git a/paddlemix/examples/imagebind/run_predict.py b/paddlemix/examples/imagebind/run_predict.py index a304a15403219..f195c74d8b8f9 100644 --- a/paddlemix/examples/imagebind/run_predict.py +++ b/paddlemix/examples/imagebind/run_predict.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os -import sys from dataclasses import dataclass, field -import numpy as np import paddle import requests from paddlenlp.trainer import PdArgumentParser @@ -26,16 +23,13 @@ from paddlemix import ImageBindModel, ImageBindProcessor from paddlemix.datasets import * from paddlemix.models import ModalityType -from paddlemix.models.imagebind.modeling import ImageBindModel from paddlemix.utils.log import logger class Predictor: def __init__(self, model_args): - self.processor = ImageBindProcessor.from_pretrained( - model_args.model_name_or_path) - self.predictor = ImageBindModel.from_pretrained( - model_args.model_name_or_path) + self.processor = ImageBindProcessor.from_pretrained(model_args.model_name_or_path) + self.predictor = ImageBindModel.from_pretrained(model_args.model_name_or_path) self.predictor.eval() def run(self, inputs): @@ -55,8 +49,7 @@ def main(model_args, data_args): # read image image_pil = Image.open(data_args.input_image).convert("RGB") elif url: - image_pil = Image.open(requests.get(url, stream=True).raw).convert( - "RGB") + image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") else: image_pil = None @@ -66,7 +59,8 @@ def main(model_args, data_args): images=image_pil, text=data_args.input_text, audios=data_args.input_audio, - return_tensors="pd", ) + return_tensors="pd", + ) inputs = {} if data_args.input_text: tokenized_processor = encoding["input_ids"] @@ -84,8 +78,7 @@ def main(model_args, data_args): if data_args.input_text: logger.info("Generate text: {}".format(embeddings[ModalityType.TEXT])) if image_pil: - logger.info("Generate vision: {}".format(embeddings[ - ModalityType.VISION])) + logger.info("Generate vision: {}".format(embeddings[ModalityType.VISION])) if data_args.input_audio: logger.info("Generate audio: {}".format(embeddings[ModalityType.AUDIO])) @@ -99,17 +92,17 @@ class DataArguments: the command line. 
""" - input_text: str = field( - default="A dog.", - metadata={"help": "The name of imagebind text input."}) + input_text: str = field(default="A dog.", metadata={"help": "The name of imagebind text input."}) input_image: str = field( default="", # wget https://github.com/facebookresearch/ImageBind/blob/main/.assets/bird_image.jpg - metadata={"help": "The name of imagebind image input."}, ) + metadata={"help": "The name of imagebind image input."}, + ) input_audio: str = field( default=None, # wget https://github.com/facebookresearch/ImageBind/blob/main/.assets/bird_audio.wav - metadata={"help": "The name of imagebind audio input."}, ) + metadata={"help": "The name of imagebind audio input."}, + ) @dataclass @@ -120,14 +113,13 @@ class ModelArguments: model_name_or_path: str = field( default="imagebind-1.2b/", - metadata={"help": "Path to pretrained model or model identifier"}, ) + metadata={"help": "Path to pretrained model or model identifier"}, + ) device: str = field( default="GPU", - metadata={ - "help": - "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU." - }, ) + metadata={"help": "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU."}, + ) if __name__ == "__main__": diff --git a/paddlemix/examples/minigpt4/merge_weight.py b/paddlemix/examples/minigpt4/merge_weight.py index 5f06d130fb67e..e6a4035bf1816 100644 --- a/paddlemix/examples/minigpt4/merge_weight.py +++ b/paddlemix/examples/minigpt4/merge_weight.py @@ -28,11 +28,9 @@ def merge(args): # load the first item: blip2-flan-t5-xxl state_dict = paddle.load(args.blip2_path) for n, p in state_dict.items(): - if (n.startswith("vision_model") or n.startswith("qformer") or - n == "query_tokens"): + if n.startswith("vision_model") or n.startswith("qformer") or n == "query_tokens": model_dict[n] = p - print( - "[1/3] load ViT, qformer and query_tokens from blip2-flan-t5-xxl done!") + print("[1/3] load ViT, qformer and query_tokens from blip2-flan-t5-xxl done!") # load the second item: vicuna llama_model = LlamaForCausalLM.from_pretrained(args.vicuna_path) @@ -58,8 +56,7 @@ def merge(args): new_p = paddle.to_tensor(p.cpu().numpy()) model_dict[new_name] = new_p - print( - "[3/3] load language_projection, some llama weights from minigpt4 done!") + print("[3/3] load language_projection, some llama weights from minigpt4 done!") save_path = os.path.join(args.save_path, "model_state.pdparams") paddle.save(model_dict, save_path) @@ -73,22 +70,26 @@ def merge(args): "--blip2_path", default="/blip2/dirname", type=str, - help="The dir name of blip2-flan-t5-xxl.", ) + help="The dir name of blip2-flan-t5-xxl.", + ) parser.add_argument( "--vicuna_path", default="/vicuna/dirname", type=str, - help="The dir name of vicuna.", ) + help="The dir name of vicuna.", + ) parser.add_argument( "--minigpt4_path", default="/minigpt4/prerained_minigpt4.pth", type=str, - help="The checkpoint path of vicuna.", ) + help="The checkpoint path of vicuna.", + ) parser.add_argument( "--save_path", default="/save/to/dirname", type=str, - help="The saving path of minigpt4.", ) + help="The saving path of minigpt4.", + ) args = parser.parse_args() args.blip2_path = os.path.join(args.blip2_path, "model_state.pdparams") diff --git a/paddlemix/examples/minigpt4/run_predict.py b/paddlemix/examples/minigpt4/run_predict.py index a883f611471dd..31da230530508 100644 --- a/paddlemix/examples/minigpt4/run_predict.py +++ b/paddlemix/examples/minigpt4/run_predict.py @@ -25,8 +25,7 @@ def predict(args): # load MiniGPT4 moel and processor - 
model = MiniGPT4ForConditionalGeneration.from_pretrained( - args.pretrained_name_or_path) + model = MiniGPT4ForConditionalGeneration.from_pretrained(args.pretrained_name_or_path) model.eval() processor = MiniGPT4Processor.from_pretrained(args.pretrained_name_or_path) print("load processor and model done!") @@ -61,7 +60,8 @@ def predict(args): "--pretrained_name_or_path", default="your directory of minigpt4", type=str, - help="The dir name of minigpt4 checkpoint.", ) + help="The dir name of minigpt4 checkpoint.", + ) args = parser.parse_args() predict(args) diff --git a/paddlemix/examples/visualglm/run_predict.py b/paddlemix/examples/visualglm/run_predict.py index a11a52f904c20..dabf2b4c5534e 100644 --- a/paddlemix/examples/visualglm/run_predict.py +++ b/paddlemix/examples/visualglm/run_predict.py @@ -26,8 +26,7 @@ def predict(args): # load VisualGLM moel and processor - model = VisualGLMForConditionalGeneration.from_pretrained( - args.pretrained_name_or_path, dtype="float16") + model = VisualGLMForConditionalGeneration.from_pretrained(args.pretrained_name_or_path, dtype="float16") model.eval() processor = VisualGLMProcessor.from_pretrained(args.pretrained_name_or_path) print("load processor and model done!") @@ -70,7 +69,8 @@ def predict(args): "--pretrained_name_or_path", default="THUDM/visualglm-6b", type=str, - help="The dir name of visualglm checkpoint.", ) + help="The dir name of visualglm checkpoint.", + ) args = parser.parse_args() predict(args) diff --git a/paddlemix/external_ops/setup.py b/paddlemix/external_ops/setup.py index 7b1fa658805dc..2c5cd345c21e6 100644 --- a/paddlemix/external_ops/setup.py +++ b/paddlemix/external_ops/setup.py @@ -63,8 +63,11 @@ def setup_fast_ln(): "--expt-relaxed-constexpr", "--expt-extended-lambda", "--use_fast_math", - ] + gencode_flags, - }, ), ) + ] + + gencode_flags, + }, + ), + ) def setup_fused_ln(): @@ -75,7 +78,9 @@ def setup_fused_ln(): setup( name="fused_ln", ext_modules=CUDAExtension( - sources=["fused_ln/layer_norm_cuda.cu", ], + sources=[ + "fused_ln/layer_norm_cuda.cu", + ], extra_compile_args={ "cxx": ["-O3"], "nvcc": [ @@ -91,8 +96,11 @@ def setup_fused_ln(): "--expt-extended-lambda", "--use_fast_math", "-maxrregcount=50", - ] + gencode_flags, - }, ), ) + ] + + gencode_flags, + }, + ), + ) run(setup_fast_ln) diff --git a/paddlemix/metrics/clip_zero_shot.py b/paddlemix/metrics/clip_zero_shot.py index c7271cd2f1f4e..f3037795883c0 100644 --- a/paddlemix/metrics/clip_zero_shot.py +++ b/paddlemix/metrics/clip_zero_shot.py @@ -21,29 +21,24 @@ from paddlemix.processors.tokenizer import tokenize -def zero_shot_classifier(model, - classnames_filename, - templates_filename, - args, - text_tower=None): +def zero_shot_classifier(model, classnames_filename, templates_filename, args, text_tower=None): classnames = [i.strip() for i in open(classnames_filename).readlines()] templates = [i.strip() for i in open(templates_filename).readlines()] if text_tower is None: if hasattr(model, "_layers"): - text_tower = (model._layers.module.encode_text - if not hasattr(model._layers, "encode_text") else - model._layers.encode_text) + text_tower = ( + model._layers.module.encode_text + if not hasattr(model._layers, "encode_text") + else model._layers.encode_text + ) else: - text_tower = (model.module.encode_text - if not hasattr(model, "encode_text") else - model.encode_text) + text_tower = model.module.encode_text if not hasattr(model, "encode_text") else model.encode_text tokenizer = tokenize with paddle.no_grad(): zeroshot_weights = [] for classname in 
tqdm(classnames): - texts = [template.format(classname) - for template in templates] # format with class + texts = [template.format(classname) for template in templates] # format with class texts = tokenizer(texts) # tokenize class_embeddings = text_tower(texts) @@ -54,13 +49,10 @@ def zero_shot_classifier(model, return zeroshot_weights -def accuracy(output, target, topk=(1, )): +def accuracy(output, target, topk=(1,)): pred = output.topk(max(topk), 1, True, True)[1].t() correct = pred.equal(target.reshape([1, -1]).expand_as(pred)) - return [ - float(correct[:k].reshape([-1]).astype(paddle.float32) - .sum(0, keepdim=True).numpy()) for k in topk - ] + return [float(correct[:k].reshape([-1]).astype(paddle.float32).sum(0, keepdim=True).numpy()) for k in topk] class DummyAutocast: @@ -97,8 +89,7 @@ def run(model, classifier, dataloader, args): autocast = get_autocast(cast_dtype) with paddle.no_grad(): top1, top5, n = 0.0, 0.0, 0.0 - for images, target in tqdm( - dataloader, unit_scale=args.per_device_eval_batch_size): + for images, target in tqdm(dataloader, unit_scale=args.per_device_eval_batch_size): if cast_dtype is not None: images = images.cast(cast_dtype) target = target @@ -109,11 +100,11 @@ def run(model, classifier, dataloader, args): else: image_features = model.encode_image(images) image_features = F.normalize(image_features, axis=-1) - logits = 100.0 * image_features @classifier + logits = 100.0 * image_features @ classifier # measure accuracy if logits.shape[-1] < 5: - (acc1, ) = accuracy(logits, target, topk=(1, )) + (acc1,) = accuracy(logits, target, topk=(1,)) acc5 = -1 else: acc1, acc5 = accuracy(logits, target, topk=(1, 5)) @@ -133,14 +124,15 @@ def zero_shot_eval(model, data, args): for k, v in data.items(): if "eval/classification" in k: data_name = os.path.basename(k) - classifier_filename = f"{os.path.dirname(v.classname_filename)}/{args.pretrained_text_model}_{data_name}_classifier.pt" + classifier_filename = ( + f"{os.path.dirname(v.classname_filename)}/{args.pretrained_text_model}_{data_name}_classifier.pt" + ) if os.path.exists(classifier_filename): print("load classifier from disk") classifier = paddle.load(classifier_filename) else: print("constructing classifier.") - classifier = zero_shot_classifier(model, v.classname_filename, - v.template_filename, args) + classifier = zero_shot_classifier(model, v.classname_filename, v.template_filename, args) paddle.save(classifier, classifier_filename) print(f"zero-shot evaluating classification task: {data_name}") if args.bf16: @@ -154,9 +146,7 @@ def zero_shot_eval(model, data, args): # FIXME: DEBUG ONLY results[f"{k}-top1"] = top1 - print( - f"zero-shot classification task: {data_name}: top1: {top1}, top5: {top5}" - ) + print(f"zero-shot classification task: {data_name}: top1: {top1}, top5: {top5}") print("Finished zero-shot evaluation.") diff --git a/paddlemix/models/blip2/Qformer.py b/paddlemix/models/blip2/Qformer.py index 32b86642170f8..979ecf7857407 100644 --- a/paddlemix/models/blip2/Qformer.py +++ b/paddlemix/models/blip2/Qformer.py @@ -13,31 +13,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sklearn -import math -from dataclasses import dataclass -from typing import Optional, Tuple, Dict, Any import inspect -import numpy as np +import math +from typing import Tuple +import numpy as np import paddle -from paddle import Tensor, device, dtype, nn -from paddle.nn import CrossEntropyLoss import paddle.nn.functional as F +from paddle import Tensor, device, nn from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker - +from paddle.distributed.fleet.utils import recompute from paddlenlp.transformers.activations import ACT2FN +from paddlenlp.transformers.bert.configuration import BertConfig from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, - CausalLMOutputWithCrossAttentions, MaskedLMOutput) - + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, +) from paddlenlp.transformers.model_utils import PretrainedModel -from paddlenlp.transformers.bert.configuration import BertConfig -import numpy as np -import paddle -from paddle.distributed.fleet.utils import recompute class CrossEntropyLoss(nn.Layer): @@ -45,7 +40,7 @@ class CrossEntropyLoss(nn.Layer): Softmax Cross entropy loss """ - def __init__(self, reduction='mean', label_smoothing=None): + def __init__(self, reduction="mean", label_smoothing=None): super().__init__() if label_smoothing is not None: assert label_smoothing >= 0 and label_smoothing <= 1, "label_smoothing must be in [0, 1]" @@ -75,12 +70,12 @@ def forward(self, x, label): loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: - label = paddle.cast(label, 'int64') + label = paddle.cast(label, "int64") loss = F.cross_entropy(x, label=label, soft_label=False) - if self.reduction == 'sum': + if self.reduction == "sum": return loss.sum() - elif self.reduction == 'mean': + elif self.reduction == "mean": return loss.mean() else: return loss @@ -93,40 +88,29 @@ class BertEmbeddings(nn.Layer): def __init__(self, config): super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.hidden_size, - padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", - paddle.expand( - paddle.arange(config.max_position_embeddings), - [1, -1])) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", paddle.expand(paddle.arange(config.max_position_embeddings), [1, -1])) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.mp_degree = config.mp_degree def forward( - self, - input_ids=None, - position_ids=None, - query_embeds=None, - past_key_values_length=0, ): + self, + input_ids=None, + position_ids=None, + query_embeds=None, + 
past_key_values_length=0, + ): if input_ids is not None: seq_length = input_ids.shape[1] else: seq_length = 0 if position_ids is None: - position_ids = self.position_ids[:, past_key_values_length: - seq_length + - past_key_values_length] + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] if input_ids is not None: embeddings = self.word_embeddings(input_ids) @@ -151,91 +135,63 @@ class BertSelfAttention(nn.Layer): def __init__(self, config, is_cross_attention): super().__init__() self.config = config - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size if config.mp_degree > 1: self.query = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.hidden_size, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) else: if config.use_fusedlinear: - self.query = paddle.incubate.nn.FusedLinear(config.hidden_size, - self.all_head_size) + self.query = paddle.incubate.nn.FusedLinear(config.hidden_size, self.all_head_size) else: self.query = nn.Linear(config.hidden_size, self.all_head_size) if is_cross_attention: if config.mp_degree > 1: self.key = fleet.meta_parallel.ColumnParallelLinear( - config.encoder_width, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.encoder_width, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) self.value = fleet.meta_parallel.ColumnParallelLinear( - config.encoder_width, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.encoder_width, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) else: if config.use_fusedlinear: - self.key = paddle.incubate.nn.FusedLinear( - config.encoder_width, self.all_head_size) - self.value = paddle.incubate.nn.FusedLinear( - config.encoder_width, self.all_head_size) + self.key = paddle.incubate.nn.FusedLinear(config.encoder_width, self.all_head_size) + self.value = paddle.incubate.nn.FusedLinear(config.encoder_width, self.all_head_size) else: - self.key = nn.Linear(config.encoder_width, - self.all_head_size) - self.value = nn.Linear(config.encoder_width, - self.all_head_size) + self.key = nn.Linear(config.encoder_width, self.all_head_size) + self.value = nn.Linear(config.encoder_width, self.all_head_size) else: if config.mp_degree > 1: self.key = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.hidden_size, self.all_head_size, weight_attr=None, has_bias=True, gather_output=True + ) self.value = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - self.all_head_size, - weight_attr=None, - has_bias=True, - gather_output=True) + config.hidden_size, self.all_head_size, 
weight_attr=None, has_bias=True, gather_output=True + ) else: if config.use_fusedlinear: - self.key = paddle.incubate.nn.FusedLinear( - config.hidden_size, self.all_head_size) - self.value = paddle.incubate.nn.FusedLinear( - config.hidden_size, self.all_head_size) + self.key = paddle.incubate.nn.FusedLinear(config.hidden_size, self.all_head_size) + self.value = paddle.incubate.nn.FusedLinear(config.hidden_size, self.all_head_size) else: self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, - self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) self.mp_degree = config.mp_degree self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -260,14 +216,15 @@ def transpose_for_scores(self, x): return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be @@ -275,17 +232,14 @@ def forward( is_cross_attention = encoder_hidden_states is not None if is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -296,34 +250,24 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. 
- attention_scores = paddle.matmul(query_layer, - key_layer.transpose([0, 1, 3, 2])) + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": seq_length = hidden_states.size()[1] position_ids_l = paddle.arange(seq_length).reshape([-1, 1]) position_ids_r = paddle.arange(seq_length).reshape([1, -1]) distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.cast( - query_layer.dtype) # fp16 compatibility + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(query_layer.dtype) # fp16 compatibility if self.position_embedding_type == "relative_key": - relative_position_scores = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = paddle.einsum( - "bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = ( - attention_scores + relative_position_scores_query + - relative_position_scores_key) - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask @@ -350,15 +294,12 @@ def forward( context_layer = paddle.matmul(attention_probs_dropped, value_layer) context_layer = context_layer.transpose([0, 2, 1, 3]) - new_context_layer_shape = context_layer.shape[:-2] + [ - self.all_head_size - ] + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -366,12 +307,10 @@ class BertSelfOutput(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.hidden_size) else: self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.mp_degree = 
config.mp_degree @@ -393,14 +332,15 @@ def __init__(self, config, is_cross_attention=False): self.output = BertSelfOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): self_outputs = self.self( hidden_states, attention_mask, @@ -408,11 +348,11 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -420,8 +360,7 @@ class BertIntermediate(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear( - config.hidden_size, config.intermediate_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.intermediate_size) else: self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): @@ -439,12 +378,10 @@ class BertOutput(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear( - config.intermediate_size, config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.intermediate_size, config.hidden_size) else: self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.mp_degree = config.mp_degree @@ -467,10 +404,8 @@ def __init__(self, config, layer_num): self.seq_len_dim = 1 self.attention = BertAttention(config) self.layer_num = layer_num - if (self.config.add_cross_attention and - layer_num % self.config.cross_attention_freq == 0): - self.crossattention = BertAttention( - config, is_cross_attention=self.config.add_cross_attention) + if self.config.add_cross_attention and layer_num % self.config.cross_attention_freq == 0: + self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention) self.has_cross_attention = True else: self.has_cross_attention = False @@ -480,25 +415,27 @@ def __init__(self, config, layer_num): self.intermediate_query = BertIntermediate(config) self.output_query = BertOutput(config) - def forward(self, - hidden_states=None, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, - **kwargs): + def forward( + self, + hidden_states=None, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + **kwargs + ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( 
hidden_states, attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] @@ -517,7 +454,8 @@ def forward(self, head_mask, encoder_hidden_states, encoder_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) query_attention_output = cross_attention_outputs[0] outputs = ( outputs + cross_attention_outputs[1:-1] @@ -527,59 +465,59 @@ def forward(self, self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, - query_attention_output, ) + query_attention_output, + ) if attention_output.shape[1] > query_length: layer_output_text = self.apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output[:, query_length:, :], ) - layer_output = paddle.concat( - [layer_output, layer_output_text], axis=1) + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) else: layer_output = self.apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output, ) - outputs = (layer_output, ) + outputs + attention_output, + ) + outputs = (layer_output,) + outputs - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs - def apply_chunking_to_forward(self, forward_fn, chunk_size, chunk_dim, - *input_tensors): - assert len( - input_tensors) > 0, "{0} has to be a tuple/list of tensors".format( - input_tensors) + def apply_chunking_to_forward(self, forward_fn, chunk_size, chunk_dim, *input_tensors): + assert len(input_tensors) > 0, "{0} has to be a tuple/list of tensors".format(input_tensors) # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility - num_args_in_forward_chunk_fn = len( - inspect.signature(forward_fn).parameters) + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) if num_args_in_forward_chunk_fn != len(input_tensors): raise ValueError( f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " - "tensors are given") + "tensors are given" + ) if chunk_size > 0: tensor_shape = input_tensors[0].shape[chunk_dim] for input_tensor in input_tensors: if input_tensor.shape[chunk_dim] != tensor_shape: raise ValueError( f"All input tenors have to be of the same shape: {tensor_shape}, " - f"found shape {input_tensor.shape[chunk_dim]}") + f"found shape {input_tensor.shape[chunk_dim]}" + ) if input_tensors[0].shape[chunk_dim] % chunk_size != 0: raise ValueError( f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " - f"size {chunk_size}") + f"size {chunk_size}" + ) num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size input_tensors_chunks = tuple( - input_tensor.chunk( - num_chunks, axis=chunk_dim) - for input_tensor in input_tensors) + input_tensor.chunk(num_chunks, axis=chunk_dim) for input_tensor in input_tensors + ) output_chunks = tuple( - forward_fn(*input_tensors_chunk) - for input_tensors_chunk in zip(*input_tensors_chunks)) + forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks) + ) return paddle.concat(output_chunks, axis=chunk_dim) return forward_fn(*input_tensors) @@ -598,56 +536,63 @@ class BertEncoder(nn.Layer): def __init__(self, 
config): super().__init__() self.config = config - self.layer = nn.LayerList( - [BertLayer(config, i) for i in range(config.num_hidden_layers)]) + self.layer = nn.LayerList([BertLayer(config, i) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = config.gradient_checkpointing def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None - all_cross_attentions = (() if output_attentions and - self.config.add_cross_attention else None) + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None next_decoder_cache = () if use_cache else None # cuda_state = paddle.get_cuda_rng_state() # paddle.set_cuda_rng_state(cuda_state) # print("qformergradient_checkpointing:{}".format(self.gradient_checkpointing)) - for i in range(self.config.num_hidden_layers): #add recompute + for i in range(self.config.num_hidden_layers): # add recompute layer_module = self.layer[i] if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): - return module(*inputs, ) + return module( + *inputs, + ) return custom_forward layer_outputs = recompute( create_custom_forward(layer_module), - *(hidden_states, attention_mask, layer_head_mask, - encoder_hidden_states, encoder_attention_mask, - past_key_value, output_attentions, query_length), - **{"preserve_rng_state": True, - "use_reentrant": False}) + *( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ), + **{"preserve_rng_state": True, "use_reentrant": False}, + ) else: layer_outputs = layer_module( @@ -658,34 +603,38 @@ def custom_forward(*inputs): encoder_attention_mask, past_key_value, output_attentions, - query_length, ) + query_length, + ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) - all_cross_attentions = all_cross_attentions + (layer_outputs[2], - ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + 
v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, ) + cross_attentions=all_cross_attentions, + ) class BertPooler(nn.Layer): @@ -701,8 +650,7 @@ def __init__(self, config: BertConfig): """ super(BertPooler, self).__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.hidden_size) else: self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() @@ -718,8 +666,7 @@ class BertPredictionHeadTransform(nn.Layer): def __init__(self, config): super().__init__() if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.hidden_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.hidden_size) else: self.dense = nn.Linear(config.hidden_size, config.hidden_size) # self.dense = fleet.meta_parallel.ColumnParallelLinear(config.hidden_size, config.hidden_size,weight_attr=None, @@ -729,8 +676,7 @@ def __init__(self, config): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -747,11 +693,9 @@ def __init__(self, config): # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. if config.use_fusedlinear: - self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, - config.vocab_size) + self.dense = paddle.incubate.nn.FusedLinear(config.hidden_size, config.vocab_size) else: - self.decoder = nn.Linear( - config.hidden_size, config.vocab_size, bias_attr=False) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) # # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` # self.decoder.bias = self.bias @@ -759,9 +703,8 @@ def __init__(self, config): initializer = paddle.nn.initializer.Constant(value=0.0) bias_data = paddle.zeros([config.vocab_size]) self.bias = self.create_parameter( - shape=[config.vocab_size], - dtype='float32', - default_initializer=initializer(bias_data)) + shape=[config.vocab_size], dtype="float32", default_initializer=initializer(bias_data) + ) # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` self.decoder.bias = self.bias @@ -780,8 +723,7 @@ def __init__(self, config): def forward(self, sequence_output, word_embeddings): prediction_scores = self.predictions(sequence_output) - prediction_scores = prediction_scores @word_embeddings.weight.t( - ) + self.predictions.bias + prediction_scores = prediction_scores @ word_embeddings.weight.t() + self.predictions.bias return prediction_scores @@ -854,11 +796,9 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: `torch.Tensor`: The inverted attention mask. 
""" if encoder_attention_mask.dim() == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, - None, :, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.dim() == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, - None, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 @@ -866,18 +806,18 @@ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: # encoder_extended_attention_mask.transpose(-1, -2)) # encoder_extended_attention_mask = encoder_extended_attention_mask.cast(dtype=encoder_attention_mask.dtype) # fp16 compatibility - encoder_extended_attention_mask = ( - 1.0 - encoder_extended_attention_mask) * np.finfo('float32').min + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * np.finfo("float32").min return encoder_extended_attention_mask def get_extended_attention_mask( - self, - attention_mask: Tensor, - input_shape: Tuple[int], - device: device, - is_decoder: bool, - has_query: bool=False, ) -> Tensor: + self, + attention_mask: Tensor, + input_shape: Tuple[int], + device: device, + is_decoder: bool, + has_query: bool = False, + ) -> Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. @@ -904,58 +844,57 @@ def get_extended_attention_mask( batch_size, seq_length = input_shape seq_ids = paddle.arange(seq_length) - causal_mask = (paddle.tile(seq_ids[None, None, :], - [batch_size, seq_length, 1]) <= - seq_ids[None, :, None]) + causal_mask = ( + paddle.tile(seq_ids[None, None, :], [batch_size, seq_length, 1]) <= seq_ids[None, :, None] + ) # add a prefix ones mask to the causal mask # causal and attention masks must have same type with pytorch version < 1.3 causal_mask = causal_mask.cast(attention_mask.dtype) if causal_mask.shape[1] < attention_mask.shape[1]: - prefix_seq_len = attention_mask.shape[ - 1] - causal_mask.shape[1] + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] if has_query: # UniLM style attention mask causal_mask = paddle.concat( [ paddle.zeros( (batch_size, prefix_seq_len, seq_length), - dtype=causal_mask.dtype, ), + dtype=causal_mask.dtype, + ), causal_mask, ], - axis=1, ) + axis=1, + ) causal_mask = paddle.concat( [ paddle.ones( - (batch_size, causal_mask.shape[1], - prefix_seq_len), - dtype=causal_mask.dtype, ), + (batch_size, causal_mask.shape[1], prefix_seq_len), + dtype=causal_mask.dtype, + ), causal_mask, ], - axis=-1, ) - extended_attention_mask = (causal_mask[:, None, :, :] * - attention_mask[:, None, None, :]) + axis=-1, + ) + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. 
# Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.cast( - self.config.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.cast(self.config.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - def get_head_mask(self, - head_mask, - num_hidden_layers, - is_attention_chunked=False) -> Tensor: + def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False) -> Tensor: """ Prepare the head mask if needed. @@ -972,8 +911,7 @@ def get_head_mask(self, `[None]` for each layer. """ if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, - num_hidden_layers) + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: @@ -984,32 +922,30 @@ def get_head_mask(self, def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( - -1) # We can specify head_mask for each layer - assert head_mask.dim( - ) == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" # head_mask = head_mask.to(dtype=num_hidden_layers.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - query_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - is_decoder=False, ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1028,24 +964,21 @@ def forward( If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # use_cache = use_cache if use_cache is not None else self.config.use_cache if input_ids is None: - assert (query_embeds is not None - ), "You have to specify query_embeds when input_ids is None" + assert query_embeds is not None, "You have to specify query_embeds when input_ids is None" # past_key_values_length past_key_values_length = ( - past_key_values[0][0].shape[2] - self.config.query_length - if past_key_values is not None else 0) + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) query_length = query_embeds.shape[1] if query_embeds is not None else 0 @@ -1053,14 +986,14 @@ def forward( input_ids=input_ids, position_ids=position_ids, query_embeds=query_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) input_shape = embedding_output.shape[:-1] batch_size, seq_length = input_shape if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
@@ -1070,34 +1003,27 @@ def forward( input_ids.shape, device, is_decoder, - has_query=(query_embeds is not None), ) + has_query=(query_embeds is not None), + ) else: - extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape, device, is_decoder) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ - 0].shape + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape else: - [encoder_batch_size, encoder_sequence_length, - _] = encoder_hidden_states.shape + [encoder_batch_size, encoder_sequence_length, _] = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: - encoder_extended_attention_mask = [ - self.invert_attention_mask(mask) - for mask in encoder_attention_mask - ] + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -1119,10 +1045,10 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - query_length=query_length, ) + query_length=query_length, + ) sequence_output = encoder_outputs[0] - pooled_output = (self.pooler(sequence_output) - if self.pooler is not None else None) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] @@ -1133,23 +1059,18 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class BertLMHeadModel(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [ - r"position_ids", r"predictions.decoder.bias" - ] - - def __init__(self, - config, - encoder_width=None, - train_in_satge1=False, - **kwargs): + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config, encoder_width=None, train_in_satge1=False, **kwargs): super().__init__(config) - from paddle.distributed import fleet + config.mp_degree = kwargs.get("mp_degree") config.encoder_width = encoder_width config.gradient_checkpointing = False @@ -1161,21 +1082,17 @@ def __init__(self, self.query_tokens = paddle.create_parameter( shape=(1, config.num_query_tokens, config.hidden_size), - dtype='float32', - default_initializer=paddle.nn.initializer.Normal( - mean=0.0, std=config.initializer_range)) + dtype="float32", + default_initializer=paddle.nn.initializer.Normal(mean=0.0, 
std=config.initializer_range), + ) if train_in_satge1: - self.vision_proj = paddle.nn.Linear( - in_features=config.hidden_size, out_features=config.embed_dim) - self.text_proj = paddle.nn.Linear( - in_features=config.hidden_size, out_features=config.embed_dim) - self.itm_head = paddle.nn.Linear( - in_features=config.hidden_size, out_features=2) - self.resize_token_embeddings(kwargs.get('tokenizer_length')) + self.vision_proj = paddle.nn.Linear(in_features=config.hidden_size, out_features=config.embed_dim) + self.text_proj = paddle.nn.Linear(in_features=config.hidden_size, out_features=config.embed_dim) + self.itm_head = paddle.nn.Linear(in_features=config.hidden_size, out_features=2) + self.resize_token_embeddings(kwargs.get("tokenizer_length")) else: - text_hidden_size = kwargs.get('text_hidden_size') - self.language_projection = paddle.nn.Linear( - in_features=config.hidden_size, out_features=text_hidden_size) + text_hidden_size = kwargs.get("text_hidden_size") + self.language_projection = paddle.nn.Linear(in_features=config.hidden_size, out_features=text_hidden_size) # self.init_weights() @@ -1186,23 +1103,24 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - query_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - past_key_values=None, - use_cache=True, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - return_logits=False, - is_decoder=True, - reduction="mean", ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1235,8 +1153,7 @@ def forward( >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False if past_key_values is not None: @@ -1255,14 +1172,14 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - is_decoder=is_decoder, ) + is_decoder=is_decoder, + ) sequence_output = outputs[0] if query_embeds is not None: - sequence_output = outputs[0][:, query_embeds.shape[1]:, :] + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] - prediction_scores = self.cls(sequence_output, - self.bert.embeddings.word_embeddings) + prediction_scores = self.cls(sequence_output, self.bert.embeddings.word_embeddings) if return_logits: return prediction_scores[:, :-1, :] @@ -1274,23 +1191,20 @@ def forward( labels = labels[:, 1:] labels = labels.flatten() # loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) - loss_fct = CrossEntropyLoss( - reduction=reduction, label_smoothing=0.1) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) valid_index = paddle.where(labels != -100)[0].flatten() - logits = shifted_prediction_scores.reshape( - (-1, self.config.vocab_size)) + logits = shifted_prediction_scores.reshape((-1, self.config.vocab_size)) logits = paddle.gather(logits, valid_index, axis=0) labels = paddle.gather(labels, valid_index, axis=0) lm_loss = loss_fct(logits, labels) if reduction == "none": - lm_loss = lm_loss.reshape( - [prediction_scores.shape(0), -1]).sum(1) + lm_loss = lm_loss.reshape([prediction_scores.shape(0), -1]).sum(1) if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return ((lm_loss, ) + output) if lm_loss is not None else output + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output return CausalLMOutputWithCrossAttentions( loss=lm_loss, @@ -1298,14 +1212,10 @@ def forward( past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, ) - - def prepare_inputs_for_generation(self, - input_ids, - query_embeds, - past=None, - attention_mask=None, - **model_kwargs): + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, query_embeds, past=None, attention_mask=None, **model_kwargs): # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly # if attention_mask is None: # attention_mask = input_ids.new_ones(input_ids.shape) @@ -1325,28 +1235,22 @@ def prepare_inputs_for_generation(self, "query_embeds": query_embeds, "attention_mask": attention_mask, "past_key_values": past, - "encoder_hidden_states": - model_kwargs.get("encoder_hidden_states", None), - "encoder_attention_mask": - model_kwargs.get("encoder_attention_mask", None), + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), "is_decoder": True, } def _reorder_cache(self, past, beam_idx): reordered_past = () for layer_past in past: - reordered_past += (tuple( - past_state.index_select(0, beam_idx) - for past_state in layer_past), ) + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in 
layer_past),) return reordered_past class BertForMaskedLM(BertPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [ - r"position_ids", r"predictions.decoder.bias" - ] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] def __init__(self, config): super().__init__(config) @@ -1363,20 +1267,21 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - head_mask=None, - query_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - return_logits=False, - is_decoder=False, ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=False, + ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., @@ -1384,8 +1289,7 @@ def forward( (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` """ - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -1398,10 +1302,11 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - is_decoder=is_decoder, ) + is_decoder=is_decoder, + ) if query_embeds is not None: - sequence_output = outputs[0][:, query_embeds.shape[1]:, :] + sequence_output = outputs[0][:, query_embeds.shape[1] :, :] prediction_scores = self.cls(sequence_output) if return_logits: @@ -1410,26 +1315,24 @@ def forward( masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct( - prediction_scores.reshape([-1, self.config.vocab_size]), - labels.reshape([-1])) + masked_lm_loss = loss_fct(prediction_scores.reshape([-1, self.config.vocab_size]), labels.reshape([-1])) if not return_dict: - output = (prediction_scores, ) + outputs[2:] - return (((masked_lm_loss, ) + output) - if masked_lm_loss is not None else output) + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, - attentions=outputs.attentions, ) + attentions=outputs.attentions, + ) def prune_linear_layer(layer, index_to_prune, dim=0): index = paddle.to_tensor(index_to_prune) num_dims = len(layer.weight.shape) - index_expanded = index.expand((layer.weight.shape[dim], )).T + index_expanded = index.expand((layer.weight.shape[dim],)).T if dim != 0: perm = list(range(num_dims)) @@ -1447,8 +1350,7 @@ def prune_linear_layer(layer, index_to_prune, dim=0): return layer -def find_pruneable_heads_and_indices(heads, n_heads, head_size, - already_pruned_heads): +def find_pruneable_heads_and_indices(heads, n_heads, head_size, already_pruned_heads): """ Finds the heads and their indices taking `already_pruned_heads` into account. 
@@ -1462,14 +1364,12 @@ def find_pruneable_heads_and_indices(heads, n_heads, head_size, `Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices. """ mask = paddle.ones([n_heads, head_size]) - heads = set( - heads - ) - already_pruned_heads # Convert to set and remove already pruned heads + heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads for head in heads: # Compute how many pruned heads are before the head and move the index accordingly head = head - sum(1 if h < head else 0 for h in already_pruned_heads) mask[head] = 0 mask = mask.reshape(-1).eq(1) # index: torch.LongTensor = torch.arange(len(mask))[mask].long() - index = paddle.arange(len(mask))[mask].astype('int64') + index = paddle.arange(len(mask))[mask].astype("int64") return heads, index diff --git a/paddlemix/models/blip2/configuration.py b/paddlemix/models/blip2/configuration.py index a86d5d1908372..c978eeb0419bf 100644 --- a/paddlemix/models/blip2/configuration.py +++ b/paddlemix/models/blip2/configuration.py @@ -17,12 +17,7 @@ import os from typing import Union -from paddlenlp.transformers import AutoConfig -from paddlenlp.transformers.auto.modeling import \ - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from paddlenlp.transformers.configuration_utils import PretrainedConfig -from paddlenlp.transformers.opt.configuration import OPTConfig -from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.utils.log import logger __all__ = [ @@ -82,18 +77,19 @@ class Blip2VisionConfig(PretrainedConfig): model_type = "blip_2_vision_model" def __init__( - self, - img_size=224, - patch_size=14, - embed_dim=1408, - depth=39, - num_heads=16, - mlp_ratio=4.3637, - qkv_bias=True, - drop_rate=0, - epsilon=1e-6, - gradient_checkpointing=False, - **kwargs, ): + self, + img_size=224, + patch_size=14, + embed_dim=1408, + depth=39, + num_heads=16, + mlp_ratio=4.3637, + qkv_bias=True, + drop_rate=0, + epsilon=1e-6, + gradient_checkpointing=False, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -108,26 +104,22 @@ def __init__( self.epsilon = epsilon self.gradient_checkpointing = gradient_checkpointing - self.in_chans = kwargs.get('in_chans', 3) - self.class_num = kwargs.get('class_num', 1000) - self.qk_scale = kwargs.get('qk_scale', None) - self.attn_drop_rate = kwargs.get('attn_drop_rate=', 0.) - self.drop_path_rate = kwargs.get('drop_path_rate', 0.) 
- self.norm_layer = kwargs.get('norm_layer', 'nn.LayerNorm') + self.in_chans = kwargs.get("in_chans", 3) + self.class_num = kwargs.get("class_num", 1000) + self.qk_scale = kwargs.get("qk_scale", None) + self.attn_drop_rate = kwargs.get("attn_drop_rate=", 0.0) + self.drop_path_rate = kwargs.get("drop_path_rate", 0.0) + self.norm_layer = kwargs.get("norm_layer", "nn.LayerNorm") @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from Blip2Config if config_dict.get("model_type") == "blip-2": config_dict = config_dict["vision_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -196,24 +188,25 @@ class Blip2QFormerConfig(PretrainedConfig): model_type = "blip_2_qformer" def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - position_embedding_type="absolute", - classifier_dropout=None, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, ): + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -234,18 +227,14 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "blip-2": config_dict = config_dict["qformer_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." @@ -300,31 +289,26 @@ class Blip2Config(PretrainedConfig): is_composition = True def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - **kwargs, ): + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): super().__init__(**kwargs) if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the Blip2VisionConfig with default values." - ) + logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.") if qformer_config is None: qformer_config = {} - logger.info( - "qformer_config is None. Initializing the Blip2QFormerConfig with default values." - ) + logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the text config with default values (`OPTConfig`)." - ) + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") self.vision_config = vision_config self.qformer_config = qformer_config self.text_config = text_config @@ -336,15 +320,16 @@ def __init__( # self.use_decoder_only_language_model = self.text_config.model_type in CONFIGURATION_MODEL_MAPPING self.initializer_factor = 1.0 self.initializer_range = 0.02 - self.freeze_vit = kwargs.get('freeze_vit', True) + self.freeze_vit = kwargs.get("freeze_vit", True) @classmethod def from_vision_qformer_text_configs( - cls, - vision_config: Blip2VisionConfig, - qformer_config: Blip2QFormerConfig, - text_config: PretrainedConfig, - **kwargs, ): + cls, + vision_config: Blip2VisionConfig, + qformer_config: Blip2QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): r""" Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model configurations. @@ -356,7 +341,8 @@ def from_vision_qformer_text_configs( vision_config=vision_config, qformer_config=qformer_config, text_config=text_config, - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/blip2/eva_vit.py b/paddlemix/models/blip2/eva_vit.py index bdbfa337377fe..d7ef6525c8a55 100644 --- a/paddlemix/models/blip2/eva_vit.py +++ b/paddlemix/models/blip2/eva_vit.py @@ -12,26 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
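Stepping back to the configuration classes reworked above, a hedged usage sketch of composing a Blip2Config from its sub-configs; the choice of OPTConfig for the text model and the use of all-default constructors are assumptions for illustration, not part of the patch.

from paddlenlp.transformers.opt.configuration import OPTConfig

from paddlemix.models.blip2.configuration import (
    Blip2Config,
    Blip2QFormerConfig,
    Blip2VisionConfig,
)

vision_config = Blip2VisionConfig()    # EVA-ViT style defaults from this file (img_size=224, depth=39, ...)
qformer_config = Blip2QFormerConfig()  # BERT-style defaults from this file
config = Blip2Config.from_vision_qformer_text_configs(
    vision_config=vision_config,
    qformer_config=qformer_config,
    text_config=OPTConfig(),           # assumed: any language-model PretrainedConfig works here
    num_query_tokens=32,
)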
-# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# reference: https://arxiv.org/abs/2010.11929 -from paddlemix.utils.log import logger from collections.abc import Callable -from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker + import numpy as np import paddle import paddle.nn as nn from paddle import _legacy_C_ops -from paddle.nn.initializer import TruncatedNormal, Constant, Normal from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.nn.functional.flash_attention import flash_attention +from paddle.nn.initializer import Constant, Normal, TruncatedNormal + from paddlemix.models.blip2.configuration import Blip2VisionConfig from paddlemix.models.blip2.modeling import Blip2PretrainedModel -from paddle.nn.functional.flash_attention import (flash_attention, ) -trunc_normal_ = TruncatedNormal(std=.02) +# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# reference: https://arxiv.org/abs/2010.11929 +from paddlemix.utils.log import logger + +trunc_normal_ = TruncatedNormal(std=0.02) normal_ = Normal -zeros_ = Constant(value=0.) -ones_ = Constant(value=1.) +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) from paddle.distributed.fleet.utils import recompute @@ -39,12 +41,12 @@ def to_2tuple(x): return tuple([x] * 2) -def drop_path(x, drop_prob=0., training=False): +def drop_path(x, drop_prob=0.0, training=False): - if drop_prob == 0. or not training: + if drop_prob == 0.0 or not training: return x keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + shape = (paddle.shape(x)[0],) + (1,) * (x.ndim - 1) random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = x.divide(keep_prob) * random_tensor @@ -61,37 +63,31 @@ def forward(self, x): class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0., - mp_degree=1, - use_fusedlinear=False): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + mp_degree=1, + use_fusedlinear=False, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features if mp_degree > 1: self.fc1 = fleet.meta_parallel.ColumnParallelLinear( - in_features, - hidden_features, - weight_attr=None, - has_bias=True, - gather_output=True) + in_features, hidden_features, weight_attr=None, has_bias=True, gather_output=True + ) self.fc2 = fleet.meta_parallel.ColumnParallelLinear( - hidden_features, - out_features, - weight_attr=None, - has_bias=True, - gather_output=True) + hidden_features, out_features, weight_attr=None, has_bias=True, gather_output=True + ) else: if use_fusedlinear: self.use_fusedlinear = True - self.fc1 = paddle.incubate.nn.FusedLinear(in_features, - hidden_features) - self.fc2 = paddle.incubate.nn.FusedLinear(hidden_features, - out_features) + self.fc1 = paddle.incubate.nn.FusedLinear(in_features, hidden_features) + self.fc2 = paddle.incubate.nn.FusedLinear(hidden_features, out_features) self.fc1 = nn.Linear(in_features, hidden_features) self.fc2 = nn.Linear(hidden_features, out_features) self.mp_degree = mp_degree @@ -102,12 +98,12 @@ def forward(self, x): if getattr(self, 
"use_fusedlinear", False): if isinstance(self.act, nn.GELU): x = _legacy_C_ops.fused_gemm_epilogue( - x, self.fc1.weight, self.fc1.bias, 'trans_x', False, - 'trans_y', False, 'activation', 'gelu') + x, self.fc1.weight, self.fc1.bias, "trans_x", False, "trans_y", False, "activation", "gelu" + ) elif isinstance(self.act, nn.ReLU): x = _legacy_C_ops.fused_gemm_epilogue( - x, self.fc1.weight, self.fc1.bias, 'trans_x', False, - 'trans_y', False, 'activation', 'relu') + x, self.fc1.weight, self.fc1.bias, "trans_x", False, "trans_y", False, "activation", "relu" + ) else: ValueError else: @@ -123,17 +119,19 @@ def forward(self, x): class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0., - window_size=None, - mp_degree=1, - use_fusedlinear=False, - use_flash_attn=False): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + window_size=None, + mp_degree=1, + use_fusedlinear=False, + use_flash_attn=False, + ): super().__init__() self.use_flash_attn = use_flash_attn self.num_heads = num_heads @@ -141,21 +139,18 @@ def __init__(self, self.scale = qk_scale or head_dim**-0.5 if mp_degree > 1: self.qkv = fleet.meta_parallel.ColumnParallelLinear( - dim, - dim * 3, - weight_attr=None, - has_bias=True, - gather_output=True) + dim, dim * 3, weight_attr=None, has_bias=True, gather_output=True + ) else: if use_fusedlinear: - self.qkv = paddle.incubate.nn.FusedLinear( - dim, dim * 3, bias_attr=qkv_bias) + self.qkv = paddle.incubate.nn.FusedLinear(dim, dim * 3, bias_attr=qkv_bias) else: self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) if mp_degree > 1: self.proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) + dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: if use_fusedlinear: self.proj = paddle.incubate.nn.FusedLinear(dim, dim) @@ -165,31 +160,25 @@ def __init__(self, self.proj_drop = nn.Dropout(proj_drop) def _register_relative_position_index( - self, - window_size, - num_heads, ): - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 + self, + window_size, + num_heads, + ): + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + [self.num_relative_distance, num_heads], default_initializer=zeros_ + ) # 2*Wh-1 * 2*Ww-1, nH coords_h = paddle.arange(window_size[0]) coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, - None] - coords_flatten[:, - None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros((window_size[0] * window_size[1] + 1, ) * 2, 
dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww + relative_position_index = paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 @@ -198,28 +187,20 @@ def _register_relative_position_index( def forward(self, x, rel_pos_bias=None): N, C = x.shape[1:] - qkv = self.qkv(x).reshape( - (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( - (2, 0, 3, 1, 4)) + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)).transpose((2, 0, 3, 1, 4)) q, k, v = qkv[0], qkv[1], qkv[2] if self.use_flash_attn: - x, _ = flash_attention( - q, - k, - v, - dropout=self.proj_drop.p, - causal=False, - return_softmax=False) + x, _ = flash_attention(q, k, v, dropout=self.proj_drop.p, causal=False, return_softmax=False) x = paddle.reshape(x, [0, 0, -1]) else: attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - if hasattr(self, 'relative_position_bias_table'): - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + if hasattr(self, "relative_position_bias_table"): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1]) + ].reshape( + [self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1] + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) attn = nn.functional.softmax(attn, axis=-1) @@ -242,31 +223,32 @@ def forward(self, x, rel_pos_bias=None): class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - init_values=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer='nn.LayerNorm', - epsilon=1e-5, - window_size=None, - mp_degree=1, - use_flash_attn=False, - use_fusedlinear=False): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + init_values=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer="nn.LayerNorm", + epsilon=1e-5, + window_size=None, + mp_degree=1, + use_flash_attn=False, + use_fusedlinear=False, + ): super().__init__() if isinstance(norm_layer, str): self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) elif isinstance(norm_layer, Callable): self.norm1 = norm_layer(dim) else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") + raise TypeError("The norm_layer must be str or paddle.nn.layer.Layer class") self.attn = Attention( dim, num_heads=num_heads, @@ -277,7 +259,8 @@ def __init__(self, window_size=window_size, mp_degree=mp_degree, use_flash_attn=use_flash_attn, - use_fusedlinear=use_fusedlinear) + use_fusedlinear=use_fusedlinear, + ) # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here self.drop_path = DropPath(drop_path) self.gamma_1 = None @@ -287,25 +270,23 @@ def __init__(self, elif isinstance(norm_layer, Callable): 
self.norm2 = norm_layer(dim) else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") + raise TypeError("The norm_layer must be str or paddle.nn.layer.Layer class") mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - mp_degree=mp_degree, - use_fusedlinear=use_fusedlinear) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + mp_degree=mp_degree, + use_fusedlinear=use_fusedlinear, + ) def forward(self, x, rel_pos_bias=None): if self.gamma_1 is not None: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) else: - x = x + self.drop_path( - self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)) x = x + self.drop_path(self.mlp(self.norm2(x))) return x @@ -314,31 +295,24 @@ class RelativePositionBias(nn.Layer): def __init__(self, window_size, num_heads): super().__init__() self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + [self.num_relative_distance, num_heads], default_initializer=zeros_ + ) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(window_size[0]) coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = coords_flatten[:, :, - None] - coords_flatten[:, - None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww + relative_position_index = paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 @@ -348,42 +322,39 @@ def __init__(self, window_size, num_heads): # trunc_normal_(self.relative_position_bias_table, std=.02) def forward(self): - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1]) 
# Wh*Ww,Wh*Ww,nH + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape( + [self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1] + ) # Wh*Ww,Wh*Ww,nH return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ + """Image to Patch Embedding""" def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): super().__init__() img_size = to_2tuple(img_size) patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * \ - (img_size[0] // patch_size[0]) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) def forward(self, x): B, C, H, W = x.shape - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." x = self.proj(x).flatten(2).transpose((0, 2, 1)) return x class VisionTransformer(Blip2PretrainedModel): - """ Vision Transformer with support for patch input - """ + """Vision Transformer with support for patch input""" + main_input_name = "pixel_values" config_class = Blip2VisionConfig @@ -396,47 +367,49 @@ def __init__(self, config: Blip2VisionConfig, **kwargs): self.num_features = self.embed_dim = config.embed_dim _img_size = to_2tuple(config.img_size) _patch_size = to_2tuple(config.patch_size) - self.window_size = (_img_size[0] // _patch_size[0], - _img_size[1] // _patch_size[1]) + self.window_size = (_img_size[0] // _patch_size[0], _img_size[1] // _patch_size[1]) self.patch_embed = PatchEmbed( img_size=config.img_size, patch_size=config.patch_size, in_chans=config.in_chans, - embed_dim=config.embed_dim) + embed_dim=config.embed_dim, + ) num_patches = self.patch_embed.num_patches - self.cls_token = self.create_parameter( - shape=(1, 1, config.embed_dim), default_initializer=zeros_) + self.cls_token = self.create_parameter(shape=(1, 1, config.embed_dim), default_initializer=zeros_) self.pos_embed = self.create_parameter( - shape=(1, num_patches + 1, config.embed_dim), - default_initializer=zeros_) + shape=(1, num_patches + 1, config.embed_dim), default_initializer=zeros_ + ) self.add_parameter("pos_embed", self.pos_embed) self.add_parameter("cls_token", self.cls_token) self.pos_drop = nn.Dropout(p=config.drop_rate) self.gradient_checkpointing = config.gradient_checkpointing - logger.info("self.gradient_checkpointing:{}".format( - self.gradient_checkpointing)) + logger.info("self.gradient_checkpointing:{}".format(self.gradient_checkpointing)) dpr = np.linspace(0, config.drop_path_rate, config.depth) - self.blocks = nn.LayerList([ - Block( - dim=config.embed_dim, - num_heads=config.num_heads, - mlp_ratio=config.mlp_ratio, - qkv_bias=config.qkv_bias, - qk_scale=config.qk_scale, - drop=config.drop_rate, - attn_drop=config.attn_drop_rate, - drop_path=dpr[i], - norm_layer=config.norm_layer, - epsilon=config.epsilon, - window_size=self.window_size, - mp_degree=mp_degree, - use_flash_attn=use_flash_attn, - use_fusedlinear=use_fusedlinear) for i in 
range(config.depth) - ]) + self.blocks = nn.LayerList( + [ + Block( + dim=config.embed_dim, + num_heads=config.num_heads, + mlp_ratio=config.mlp_ratio, + qkv_bias=config.qkv_bias, + qk_scale=config.qk_scale, + drop=config.drop_rate, + attn_drop=config.attn_drop_rate, + drop_path=dpr[i], + norm_layer=config.norm_layer, + epsilon=config.epsilon, + window_size=self.window_size, + mp_degree=mp_degree, + use_flash_attn=use_flash_attn, + use_fusedlinear=use_fusedlinear, + ) + for i in range(config.depth) + ] + ) self.mp_degree = mp_degree if self.pos_embed is not None: @@ -447,9 +420,7 @@ def __init__(self, config: Blip2VisionConfig, **kwargs): def _init_weights(self, m): if isinstance(m, (nn.Linear, fleet.meta_parallel.ColumnParallelLinear)): trunc_normal_(m.weight) - if isinstance(m, - (nn.Linear, fleet.meta_parallel.ColumnParallelLinear - )) and m.bias is not None: + if isinstance(m, (nn.Linear, fleet.meta_parallel.ColumnParallelLinear)) and m.bias is not None: zeros_(m.bias) elif isinstance(m, nn.LayerNorm): zeros_(m.bias) @@ -469,15 +440,14 @@ def forward_features(self, x): x = self.pos_drop(x) else: x = self.pos_drop(x) - rel_pos_bias = self.rel_pos_bias() if hasattr(self, - 'rel_pos_bias') else None + rel_pos_bias = self.rel_pos_bias() if hasattr(self, "rel_pos_bias") else None for blk in self.blocks: if self.gradient_checkpointing and self.training: x = recompute(blk, x, rel_pos_bias=rel_pos_bias) else: x = blk(x, rel_pos_bias=rel_pos_bias) - #x = self.norm(x) + # x = self.norm(x) return x def forward(self, x): @@ -486,60 +456,47 @@ def forward(self, x): def interpolate_pos_embed(model, checkpoint_model): - if 'visual_encoder.pos_embed' in checkpoint_model: - pos_embed_checkpoint = checkpoint_model['visual_encoder.pos_embed'] + if "visual_encoder.pos_embed" in checkpoint_model: + pos_embed_checkpoint = checkpoint_model["visual_encoder.pos_embed"] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = model.visual_encoder.patch_embed.num_patches - num_extra_tokens = model.visual_encoder.pos_embed.shape[ - -2] - num_patches + num_extra_tokens = model.visual_encoder.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) # height (== width) for the new position embedding new_size = int(num_patches**0.5) # class_token and dist_token are kept unchanged if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) + pos_tokens = pos_tokens.reshape((-1, orig_size, orig_size, embedding_size)).transpose((0, 3, 1, 2)) pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode='bicubic', - align_corners=False) + pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False + ) pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model['visual_encoder.pos_embed'] = new_pos_embed - elif 'pos_embed' in checkpoint_model: - pos_embed_checkpoint = 
checkpoint_model['pos_embed'] + checkpoint_model["visual_encoder.pos_embed"] = new_pos_embed + elif "pos_embed" in checkpoint_model: + pos_embed_checkpoint = checkpoint_model["pos_embed"] embedding_size = pos_embed_checkpoint.shape[-1] num_patches = model.patch_embed.num_patches num_extra_tokens = model.pos_embed.shape[-2] - num_patches # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) # height (== width) for the new position embedding new_size = int(num_patches**0.5) # class_token and dist_token are kept unchanged if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] # only the position tokens are interpolated pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) + pos_tokens = pos_tokens.reshape((-1, orig_size, orig_size, embedding_size)).transpose((0, 3, 1, 2)) pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode='bicubic', - align_corners=False) + pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False + ) pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model['pos_embed'] = new_pos_embed + checkpoint_model["pos_embed"] = new_pos_embed diff --git a/paddlemix/models/blip2/modeling.py b/paddlemix/models/blip2/modeling.py index caeb250df69da..0cfb3c7daa98c 100644 --- a/paddlemix/models/blip2/modeling.py +++ b/paddlemix/models/blip2/modeling.py @@ -13,43 +13,46 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" Paddle BLIP2 model.""" -from paddlemix.utils.log import logger -import math from dataclasses import dataclass from typing import Any, Optional, Tuple, Union import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.distributed.fleet.utils import recompute import paddle.distributed as dist - +import paddle.nn as nn +from paddlenlp.transformers import AutoTokenizer from paddlenlp.transformers.model_outputs import ModelOutput from paddlenlp.transformers.model_utils import PretrainedModel - -from paddlemix.models.blip2.modeling_opt import OPTForCausalLM -from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.transformers.t5.modeling import T5ForConditionalGeneration from paddlenlp.utils.initializer import normal_, ones_, zeros_ -from paddlenlp.utils.log import logger -from .configuration import Blip2Config + +from paddlemix.models.blip2.modeling_opt import OPTForCausalLM +from paddlemix.models.blip2.modeling_utils import ( + all_gather_with_grad, + concat_all_gather, + disabled_train, + masked_fill, +) from paddlemix.models.blip2.Qformer import BertLMHeadModel -from paddlenlp.transformers import AutoTokenizer -from paddlemix.models.blip2.modeling_utils import disabled_train, all_gather_with_grad, concat_all_gather, masked_fill +from paddlemix.utils.log import logger + +from .configuration import Blip2Config BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ "Salesforce/blip2-flan-t5-xl", "Salesforce/blip2-opt-2.7b", ] -__all__ = ["Blip2ForConditionalGeneration", ] +__all__ = [ + "Blip2ForConditionalGeneration", +] def Parameter(tensor): return paddle.create_parameter( tensor.shape, dtype=tensor.dtype, - default_initializer=nn.initializer.Assign(tensor), ) + default_initializer=nn.initializer.Assign(tensor), + ) @dataclass @@ -77,9 +80,11 @@ class Blip2ForConditionalGenerationModelOutput(ModelOutput): def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in - ["vision_outputs", "qformer_outputs", "language_model_outputs"] else - getattr(self, k).to_tuple() for k in self.keys()) + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) @dataclass @@ -87,6 +92,7 @@ class Blip2ForStage1ModelOutput(Blip2ForConditionalGenerationModelOutput): """ Class defining the outputs of [`Blip2ForStage1ModelOutput`]. """ + loss: Optional[Tuple[paddle.Tensor]] = None loss_itc: Optional[Tuple[paddle.Tensor]] = None loss_itm: Optional[paddle.Tensor] = None @@ -113,8 +119,7 @@ class Blip2PretrainedModel(PretrainedModel): def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range - if (isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or - isinstance(module, nn.Linear)): + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): normal_(module.weight, mean=0.0, std=factor) if hasattr(module, "bias") and module.bias is not None: zeros_(module.bias) @@ -132,12 +137,9 @@ def init_tokenizer(cls, tokenizer_name="bert-base-uncased"): return tokenizer @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path, - from_hf_hub: bool=False, - subfolder: str=None, - *args, - **kwargs): + def from_pretrained( + cls, pretrained_model_name_or_path, from_hf_hub: bool = False, subfolder: str = None, *args, **kwargs + ): """ Creates an instance of `PretrainedModel`. 
Model weights are loaded by specifying name of a built-in pretrained model, a pretrained model from HF Hub, a community contributed model, @@ -192,18 +194,22 @@ def from_pretrained(cls, model = BertForSequenceClassification.from_pretrained('./my_bert/' """ import os + + from paddlenlp.transformers.configuration_utils import PretrainedConfig + from paddlenlp.transformers.model_utils import load_state_dict, no_init_weights from paddlenlp.transformers.utils import ( ContextManagers, + device_guard, is_paddle_support_lazy_init, is_safetensors_available, resolve_cache_dir, - device_guard, ) - from paddlenlp.transformers.configuration_utils import PretrainedConfig + ) from paddlenlp.utils.env import ( CONFIG_NAME, PADDLE_WEIGHTS_NAME, - PYTORCH_WEIGHTS_NAME, ) - from paddlenlp.transformers.model_utils import no_init_weights, load_state_dict + PYTORCH_WEIGHTS_NAME, + ) + config = kwargs.pop("config", None) state_dict = kwargs.pop("state_dict", None) cache_dir = kwargs.pop("cache_dir", None) @@ -212,16 +218,14 @@ def from_pretrained(cls, dtype = kwargs.pop("dtype", None) subfolder = kwargs.pop("subfolder", "") variant = kwargs.pop("variant", None) - use_safetensors = kwargs.pop("use_safetensors", None - if is_safetensors_available() else False) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) convert_from_torch = kwargs.pop("convert_from_torch", None) load_state_as_np = kwargs.pop("load_state_as_np", None) mp_degree = kwargs.pop("mp_degree", 1) if load_state_as_np is not None: - logger.warning( - "`load_state_as_np` is deprecated, please delete it!") + logger.warning("`load_state_as_np` is deprecated, please delete it!") model_kwargs = kwargs @@ -236,8 +240,7 @@ def from_pretrained(cls, if convert_from_torch is None: convert_from_torch = False - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, - from_hf_hub, cache_dir) + cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) # 1. get the PretrainedConfig to init model if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path @@ -248,14 +251,14 @@ def from_pretrained(cls, force_download=force_download, from_hf_hub=from_hf_hub, subfolder=subfolder, - **kwargs, ) + **kwargs, + ) if not os.path.exists(os.path.join(cache_dir, CONFIG_NAME)): config.save_pretrained(cache_dir) # refine options for config config.mp_degree = mp_degree - convert_from_torch = cls.support_conversion( - config) and convert_from_torch + convert_from_torch = cls.support_conversion(config) and convert_from_torch if dtype is None: dtype = config.dtype @@ -285,7 +288,8 @@ def from_pretrained(cls, config=config, convert_from_torch=convert_from_torch, use_safetensors=use_safetensors, - variant=variant, ) + variant=variant, + ) # load pt weights early so that we know which dtype to init the model under if not is_sharded and state_dict is None: @@ -297,8 +301,7 @@ def from_pretrained(cls, f"Starting to convert pytorch weight file<{resolved_archive_file}> to " f"paddle weight file<{os.path.join(cache_dir, PADDLE_WEIGHTS_NAME)}> ..." ) - state_dict = cls.convert(resolved_archive_file, config, - cache_dir) + state_dict = cls.convert(resolved_archive_file, config, cache_dir) else: raise ValueError( f"download the {PYTORCH_WEIGHTS_NAME} weight file, but model<{cls}> " @@ -306,19 +309,15 @@ def from_pretrained(cls, ) else: # 4. 
loading non-sharded ckpt from the state dict - if config.tensor_parallel_degree > 1 and resolved_archive_file.endswith( - "model_state.pdparams"): - state_dict = cls.convert_tensor_parallel( - resolved_archive_file, config) + if config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model_state.pdparams"): + state_dict = cls.convert_tensor_parallel(resolved_archive_file, config) else: state_dict = load_state_dict(resolved_archive_file) - logger.info( - "Loaded weights file from disk, setting weights to model.") + logger.info("Loaded weights file from disk, setting weights to model.") # Check if `_keep_in_fp32_modules` is not None - use_keep_in_fp32_modules = ( - cls._keep_in_fp32_modules is not None) and dtype == "float16" + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and dtype == "float16" if is_sharded: loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] @@ -333,14 +332,14 @@ def from_pretrained(cls, for k in list(state_dict.keys()): if not isinstance(state_dict[k], paddle.Tensor): with device_guard(): - state_dict[k] = paddle.Tensor( - state_dict.pop(k), zero_copy=True) + state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) # 3. init the model init_args = config["init_args"] or () with ContextManagers(init_contexts): model = cls(config, *init_args, **model_kwargs) from paddlemix.models.blip2.eva_vit import interpolate_pos_embed + interpolate_pos_embed(model, state_dict) if use_keep_in_fp32_modules: # low_cpu_mem_usage = True @@ -358,7 +357,8 @@ def from_pretrained(cls, ignore_mismatched_sizes=ignore_mismatched_sizes, low_cpu_mem_usage=low_cpu_mem_usage, dtype=dtype, - keep_in_fp32_modules=keep_in_fp32_modules, ) + keep_in_fp32_modules=keep_in_fp32_modules, + ) if paddle.in_dynamic_mode(): return model @@ -376,13 +376,15 @@ class Blip2ForConditionalGeneration(Blip2PretrainedModel): ] def __init__( - self, - config: Blip2Config, ): + self, + config: Blip2Config, + ): super().__init__(config) from paddlemix.models.blip2.eva_vit import VisionTransformer + self.visual_encoder = VisionTransformer.from_pretrained( - pretrained_model_name_or_path=config.vision_config, - mp_degree=config.mp_degree) + pretrained_model_name_or_path=config.vision_config, mp_degree=config.mp_degree + ) self.freeze_vit = config.freeze_vit self.train_stage1 = False if self.freeze_vit: @@ -400,33 +402,32 @@ def __init__( encoder_width=self.visual_encoder.num_features, train_in_satge1=True, tokenizer_length=len(self.tokenizer), - mp_degree=config.mp_degree) + mp_degree=config.mp_degree, + ) state_dict = self.Qformer.state_dict() for name, param in self.Qformer.named_parameters(): - if '_query' in name: - key_orig = name.replace('_query', '') - param.copy_(state_dict[key_orig], False) ### problem + if "_query" in name: + key_orig = name.replace("_query", "") + param.copy_(state_dict[key_orig], False) self.temp = self.create_parameter( - shape=(1, ), - default_initializer=paddle.nn.initializer.Constant(value=0.07)) + shape=(1,), default_initializer=paddle.nn.initializer.Constant(value=0.07) + ) self.max_txt_len = config.get("max_txt_len") else: if config.use_decoder_only_language_model: if "opt" in config.text_config: language_model = OPTForCausalLM.from_pretrained( - config.text_config, - load_state_as_np=True, - mp_degree=config.mp_degree) + config.text_config, load_state_as_np=True, mp_degree=config.mp_degree + ) else: raise NotImplementedError else: if "t5" in config.text_config: language_model = T5ForConditionalGeneration( - config.text_config, - 
load_state_as_np=True, - mp_degree=config.mp_degree) + config.text_config, load_state_as_np=True, mp_degree=config.mp_degree + ) else: raise NotImplementedError @@ -441,7 +442,8 @@ def __init__( train_in_satge1=False, text_hidden_size=self.language_model.hidden_size, ignore_mismatched_sizes=True, - mp_degree=config.mp_degree) + mp_degree=config.mp_degree, + ) self.Qformer.cls = None self.Qformer.bert.embeddings.word_embeddings = None self.Qformer.bert.embeddings.position_embeddings = None @@ -452,13 +454,15 @@ def __init__( def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding - def forward(self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor=None, - attention_mask: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, - text_input_stage1: Optional[paddle.Tensor]=None, - **kwargs): + def forward( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + text_input_stage1: Optional[paddle.Tensor] = None, + **kwargs + ): if self.train_stage1: return self.forward_stage1(pixel_values, text_input_stage1) @@ -467,15 +471,17 @@ def forward(self, pixel_values, input_ids, attention_mask, - return_dict, ) + return_dict, + ) def forward_stage2( - self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, - **kwargs) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + **kwargs + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -516,54 +522,50 @@ def forward_stage2( >>> print(generated_text) two ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - with paddle.amp.auto_cast(level='O2'): - image_embeds = self.Qformer.ln_vision( - self.visual_encoder(pixel_values)) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + with paddle.amp.auto_cast(level="O2"): + image_embeds = self.Qformer.ln_vision(self.visual_encoder(pixel_values)) image_embeds = image_embeds.astype("float32") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - [image_embeds.shape[0], -1, -1]) + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + query_tokens = self.Qformer.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs[0] # step 3: use the language model, conditioned on the query outputs and the prompt language_model_inputs = self.Qformer.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat( - [language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat([language_model_inputs, 
inputs_embeds], axis=1) if attention_mask is None: attention_mask = paddle.ones_like(input_ids) - attention_mask = paddle.concat( - [language_model_attention_mask, attention_mask], axis=1) + attention_mask = paddle.concat([language_model_attention_mask, attention_mask], axis=1) - targets = input_ids * (1 - ( - input_ids == self.pad_token_id).astype(input_ids.dtype)) + ( - input_ids == self.pad_token_id).astype(input_ids.dtype) * (-100) + targets = input_ids * (1 - (input_ids == self.pad_token_id).astype(input_ids.dtype)) + ( + input_ids == self.pad_token_id + ).astype(input_ids.dtype) * (-100) - empty_targets = paddle.ones( - language_model_attention_mask.shape, dtype="int64").fill_(-100) + empty_targets = paddle.ones(language_model_attention_mask.shape, dtype="int64").fill_(-100) labels = paddle.concat([empty_targets, targets], axis=1) labels.stop_gradient = True - with paddle.amp.auto_cast(level='O2'): + with paddle.amp.auto_cast(level="O2"): outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, return_dict=True, - labels=labels, ) + labels=labels, + ) loss = outputs.loss - return Blip2ForConditionalGenerationModelOutput(loss=loss, ) + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + ) def forward_stage1(self, pixel_values, text_input): text = text_input @@ -572,80 +574,67 @@ def forward_stage1(self, pixel_values, text_input): image_embeds = self.Qformer.ln_vision(self.visual_encoder(image)) image_atts = paddle.ones(image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - shape=[image_embeds.shape[0], -1, -1]) + query_tokens = self.Qformer.query_tokens.expand(shape=[image_embeds.shape[0], -1, -1]) query_output = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_atts, use_cache=True, - return_dict=True) + return_dict=True, + ) image_feats = paddle.nn.functional.normalize( - x=self.Qformer.vision_proj(query_output.last_hidden_state), axis=-1) + x=self.Qformer.vision_proj(query_output.last_hidden_state), axis=-1 + ) text_tokens = self.tokenizer( text, - padding='max_length', + padding="max_length", truncation=True, max_length=self.max_txt_len, return_attention_mask=True, - return_tensors="pd") + return_tensors="pd", + ) text_output = self.Qformer.bert( - text_tokens.input_ids, - attention_mask=text_tokens.attention_mask, - return_dict=True) + text_tokens.input_ids, attention_mask=text_tokens.attention_mask, return_dict=True + ) text_feat = paddle.nn.functional.normalize( - self.Qformer.text_proj(text_output.last_hidden_state[:, 0, :]), - axis=-1) + self.Qformer.text_proj(text_output.last_hidden_state[:, 0, :]), axis=-1 + ) - ###============== Image-text Contrastive ===================### + # Image-text Contrastive # image_feats_all = image_feats # text_feat_all = text_feat image_feats_all = concat_all_gather(image_feats) text_feat_all = concat_all_gather(text_feat) - sim_q2t = paddle.matmul( - image_feats.unsqueeze(axis=1), - text_feat_all.unsqueeze(axis=-1)).squeeze() + sim_q2t = paddle.matmul(image_feats.unsqueeze(axis=1), text_feat_all.unsqueeze(axis=-1)).squeeze() sim_i2t = sim_q2t.max(axis=-1) sim_i2t = sim_i2t / self.temp sim_t2q = paddle.matmul( - x=text_feat.unsqueeze(axis=1).unsqueeze(axis=1), - y=image_feats_all.transpose(perm=[0, 2, 1])).squeeze() + x=text_feat.unsqueeze(axis=1).unsqueeze(axis=1), y=image_feats_all.transpose(perm=[0, 2, 1]) + ).squeeze() sim_t2i = sim_t2q.max(axis=-1) sim_t2i = sim_t2i / self.temp rank = 
dist.get_rank() bs = image.shape[0] - targets = paddle.linspace( - start=rank * bs, stop=rank * bs + bs - 1, num=bs).astype(int) - one_hot_label = paddle.nn.functional.one_hot( - targets, num_classes=sim_i2t.shape[1]) - smooth_label = paddle.nn.functional.label_smooth( - label=one_hot_label, epsilon=0.1) - loss_itc = (paddle.nn.functional.cross_entropy( - input=sim_i2t, label=smooth_label, soft_label=True) + - paddle.nn.functional.cross_entropy( - input=sim_t2i, label=smooth_label, soft_label=True)) / 2 + targets = paddle.linspace(start=rank * bs, stop=rank * bs + bs - 1, num=bs).astype(int) + one_hot_label = paddle.nn.functional.one_hot(targets, num_classes=sim_i2t.shape[1]) + smooth_label = paddle.nn.functional.label_smooth(label=one_hot_label, epsilon=0.1) + loss_itc = ( + paddle.nn.functional.cross_entropy(input=sim_i2t, label=smooth_label, soft_label=True) + + paddle.nn.functional.cross_entropy(input=sim_t2i, label=smooth_label, soft_label=True) + ) / 2 text_input_ids_world = concat_all_gather(text_tokens.input_ids) - text_attention_mask_world = concat_all_gather( - text_tokens.attention_mask) + text_attention_mask_world = concat_all_gather(text_tokens.attention_mask) image_embeds_world = all_gather_with_grad(image_embeds) with paddle.no_grad(): - weights_t2i = paddle.nn.functional.softmax( - x=sim_t2i, axis=1) + 0.0001 - weights_t2i_list = paddle.chunk( - weights_t2i, - chunks=paddle.distributed.get_world_size(), - axis=-1) + weights_t2i = paddle.nn.functional.softmax(x=sim_t2i, axis=1) + 0.0001 + weights_t2i_list = paddle.chunk(weights_t2i, chunks=paddle.distributed.get_world_size(), axis=-1) weights_t2i_list[rank].fill_diagonal_(value=0) weights_t2i = paddle.concat(weights_t2i_list, axis=-1) - weights_i2t = paddle.nn.functional.softmax( - x=sim_i2t, axis=1) + 0.0001 - weights_i2t_list = paddle.chunk( - weights_i2t, - chunks=paddle.distributed.get_world_size(), - axis=-1) + weights_i2t = paddle.nn.functional.softmax(x=sim_i2t, axis=1) + 0.0001 + weights_i2t_list = paddle.chunk(weights_i2t, chunks=paddle.distributed.get_world_size(), axis=-1) weights_i2t_list[rank].fill_diagonal_(value=0) weights_i2t = paddle.concat(weights_i2t_list, axis=-1) image_embeds_neg = [] @@ -661,79 +650,59 @@ def forward_stage1(self, pixel_values, text_input): text_atts_neg.append(text_attention_mask_world[neg_idx]) text_ids_neg = paddle.stack(x=text_ids_neg, axis=0) text_atts_neg = paddle.stack(x=text_atts_neg, axis=0) - text_ids_all = paddle.concat( - x=[text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], - axis=0) + text_ids_all = paddle.concat(x=[text_tokens.input_ids, text_tokens.input_ids, text_ids_neg], axis=0) text_atts_all = paddle.concat( - x=[ - text_tokens.attention_mask, text_tokens.attention_mask, - text_atts_neg - ], - axis=0) - query_tokens_itm = self.Qformer.query_tokens.expand( - shape=[text_ids_all.shape[0], -1, -1]) - query_atts_itm = paddle.ones( - shape=query_tokens_itm.shape[:-1], dtype='int64') - attention_mask_all = paddle.concat( - x=[query_atts_itm, text_atts_all], axis=1) - image_embeds_all = paddle.concat( - x=[image_embeds, image_embeds_neg, image_embeds], axis=0) - image_atts_all = paddle.ones( - shape=image_embeds_all.shape[:-1], dtype='int64') + x=[text_tokens.attention_mask, text_tokens.attention_mask, text_atts_neg], axis=0 + ) + query_tokens_itm = self.Qformer.query_tokens.expand(shape=[text_ids_all.shape[0], -1, -1]) + query_atts_itm = paddle.ones(shape=query_tokens_itm.shape[:-1], dtype="int64") + attention_mask_all = paddle.concat(x=[query_atts_itm, 
text_atts_all], axis=1) + image_embeds_all = paddle.concat(x=[image_embeds, image_embeds_neg, image_embeds], axis=0) + image_atts_all = paddle.ones(shape=image_embeds_all.shape[:-1], dtype="int64") output_itm = self.Qformer.bert( text_ids_all, query_embeds=query_tokens_itm, attention_mask=attention_mask_all, encoder_hidden_states=image_embeds_all, encoder_attention_mask=image_atts_all, - return_dict=True) - vl_embeddings = output_itm.last_hidden_state[:, :query_tokens_itm.shape[ - 1], :] + return_dict=True, + ) + vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.shape[1], :] vl_output = self.Qformer.itm_head(vl_embeddings) logits = vl_output.mean(axis=1) - itm_labels = paddle.concat( - [ - paddle.ones( - [bs], dtype='int64'), paddle.zeros( - [2 * bs], dtype='int64') - ], - axis=0) - loss_itm = paddle.nn.functional.cross_entropy( - input=logits, label=itm_labels) - ##================= Image Captioning ========================## + itm_labels = paddle.concat([paddle.ones([bs], dtype="int64"), paddle.zeros([2 * bs], dtype="int64")], axis=0) + loss_itm = paddle.nn.functional.cross_entropy(input=logits, label=itm_labels) + # Image Captioning decoder_input_ids = text_tokens.input_ids.clone() decoder_input_ids[:, (0)] = self.tokenizer.bos_token_id - labels = masked_fill(decoder_input_ids, - decoder_input_ids == self.tokenizer.pad_token_id, - -100) - query_atts = paddle.ones(shape=query_tokens.shape[:-1], dtype='int64') - attention_mask = paddle.concat( - x=[query_atts, text_tokens.attention_mask], axis=1) - #import pdb;pdb.set_trace() + labels = masked_fill(decoder_input_ids, decoder_input_ids == self.tokenizer.pad_token_id, -100) + query_atts = paddle.ones(shape=query_tokens.shape[:-1], dtype="int64") + attention_mask = paddle.concat(x=[query_atts, text_tokens.attention_mask], axis=1) lm_output = self.Qformer( decoder_input_ids, attention_mask=attention_mask, past_key_values=query_output.past_key_values, return_dict=True, - labels=labels) + labels=labels, + ) loss_lm = lm_output.loss return Blip2ForStage1ModelOutput( - loss=loss_itc + loss_itm + loss_lm, - loss_itc=loss_itc, - loss_itm=loss_itm, - loss_lm=loss_lm) + loss=loss_itc + loss_itm + loss_lm, loss_itc=loss_itc, loss_itm=loss_itm, loss_lm=loss_lm + ) @paddle.no_grad() - def generate_stage1(self, - samples, - use_nucleus_sampling=False, - num_beams=3, - max_length=30, - min_length=10, - top_p=0.9, - repetition_penalty=1.0): + def generate_stage1( + self, + samples, + use_nucleus_sampling=False, + num_beams=3, + max_length=30, + min_length=10, + top_p=0.9, + repetition_penalty=1.0, + ): """ Args: samples (dict): A dictionary containing the following keys: @@ -748,22 +717,16 @@ def generate_stage1(self, Returns: captions (list): A list of strings of length batch_size * num_captions. 
""" - image = samples['image'] + image = samples["image"] image_embeds = self.ln_vision(self.visual_encoder(image)) if not use_nucleus_sampling: image_embeds = image_embeds.repeat_interleave(num_beams, axis=0) else: num_beams = 1 - image_atts = paddle.ones(shape=image_embeds.shape[:-1], dtype='int64') - model_kwargs = { - 'encoder_hidden_states': image_embeds, - 'encoder_attention_mask': image_atts - } - input_ids = paddle.empty( - shape=[image.shape[0], 1], - dtype='int64').fill_(value=self.tokenizer.bos_token_id) - query_tokens = self.query_tokens.expand( - shape=[image_embeds.shape[0], -1, -1]) + image_atts = paddle.ones(shape=image_embeds.shape[:-1], dtype="int64") + model_kwargs = {"encoder_hidden_states": image_embeds, "encoder_attention_mask": image_atts} + input_ids = paddle.empty(shape=[image.shape[0], 1], dtype="int64").fill_(value=self.tokenizer.bos_token_id) + query_tokens = self.query_tokens.expand(shape=[image_embeds.shape[0], -1, -1]) outputs = self.Qformer.generate( input_ids=input_ids, query_embeds=query_tokens, @@ -774,18 +737,19 @@ def generate_stage1(self, top_p=top_p, eos_token_id=self.tokenizer.sep_token_id, pad_token_id=self.tokenizer.pad_token_id, - **model_kwargs) - captions = self.tokenizer.batch_decode( - outputs, skip_special_tokens=True) + **model_kwargs, + ) + captions = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) return captions @paddle.no_grad() def generate( - self, - pixel_values: paddle.Tensor, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
Args: @@ -800,32 +764,27 @@ def generate( """ batch_size = pixel_values.shape[0] image_embeds = self.Qformer.ln_vision(self.visual_encoder(pixel_values)) - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - [image_embeds.shape[0], -1, -1]) + query_tokens = self.Qformer.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state language_model_inputs = self.Qformer.language_projection(query_output) - language_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") if input_ids is None: - input_ids = paddle.to_tensor( - [[self.config.text_config.bos_token_id]]).tile([batch_size, 1]) + input_ids = paddle.to_tensor([[self.config.text_config.bos_token_id]]).tile([batch_size, 1]) if attention_mask is None: attention_mask = paddle.ones_like(input_ids) - attention_mask = paddle.concat( - [language_attention_mask, attention_mask], axis=1) + attention_mask = paddle.concat([language_attention_mask, attention_mask], axis=1) # concatenate query embeddings with prompt embeddings inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat( - [language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, @@ -840,63 +799,62 @@ def generate( eos_token_id=50118, repetition_penalty=1, length_penalty=1, - num_return_sequences=1, ) + num_return_sequences=1, + ) return outputs @paddle.no_grad() def encode_image( - self, - pixel_values: paddle.Tensor, - **kwargs, ): - image_embeds = self.ln_vision( - self.visual_encoder(pixel_values.astype("float16"))) + self, + pixel_values: paddle.Tensor, + **kwargs, + ): + image_embeds = self.ln_vision(self.visual_encoder(pixel_values.astype("float16"))) image_embeds = image_embeds.astype("float32") - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs[0] return query_output @paddle.no_grad() - def predict_answers(self, - pixel_values: paddle.Tensor, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - max_len=10, - min_len=1, - **kwargs): - batch_size = pixel_values.shape[0] + def predict_answers( + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + max_len=10, + min_len=1, + **kwargs + ): + # batch_size = pixel_values.shape[0] image_embeds = self.Qformer.ln_vision(self.visual_encoder(pixel_values)) - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") - query_tokens = self.Qformer.query_tokens.expand( - 
[image_embeds.shape[0], -1, -1]) + query_tokens = self.Qformer.query_tokens.expand([image_embeds.shape[0], -1, -1]) query_outputs = self.Qformer.bert( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state language_model_inputs = self.Qformer.language_projection(query_output) - language_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") - attention_mask = paddle.concat( - [language_attention_mask, attention_mask], axis=1) + attention_mask = paddle.concat([language_attention_mask, attention_mask], axis=1) # concatenate query embeddings with prompt embeddings inputs_embeds = self.language_model.get_input_embeddings()(input_ids) - inputs_embeds = paddle.concat( - [language_model_inputs, inputs_embeds], axis=1) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, @@ -909,7 +867,8 @@ def predict_answers(self, min_length=min_len, eos_token_id=50118, repetition_penalty=1, - length_penalty=0, ) + length_penalty=0, + ) return outputs diff --git a/paddlemix/models/blip2/modeling_opt.py b/paddlemix/models/blip2/modeling_opt.py index 3fdec9f248951..ee9e8250159d1 100644 --- a/paddlemix/models/blip2/modeling_opt.py +++ b/paddlemix/models/blip2/modeling_opt.py @@ -25,28 +25,25 @@ import paddle.nn.functional as F import paddle.tensor as tensor from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.fluid import layers from paddle.nn import Layer +from paddle.nn.functional.flash_attention import flash_attention from paddle.nn.layer.transformer import _convert_param_attr_to_list -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.nn.functional.flash_attention import (flash_attention, ) - from paddlenlp.transformers.conversion_utils import StateDictNameMapping -from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model -from paddlenlp.utils.log import logger - from paddlenlp.transformers.model_outputs import ( BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, ) + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from paddlenlp.transformers.opt.configuration import ( OPT_PRETRAINED_INIT_CONFIGURATION, OPT_PRETRAINED_RESOURCE_FILES_MAP, - OPTConfig, ) + OPTConfig, +) +from paddlenlp.utils.log import logger -__all__ = [ - "OPTModel", "OPTPretrainedModel", "OPTForCausalLM", - "OPTForConditionalGeneration" -] +__all__ = ["OPTModel", "OPTPretrainedModel", "OPTForCausalLM", "OPTForConditionalGeneration"] def finfo(dtype): @@ -64,24 +61,16 @@ def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): """ batch_size, target_length = input_ids_shape - mask = paddle.full((target_length, target_length), - float(finfo(paddle.get_default_dtype()).min)) + mask = paddle.full((target_length, target_length), float(finfo(paddle.get_default_dtype()).min)) mask_cond = paddle.arange(mask.shape[-1]) mask_cond = mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]) mask = paddle.where(mask_cond, paddle.full(mask_cond.shape, 0), mask) if past_key_values_length > 0: - mask = paddle.concat( - [ - paddle.zeros( - [target_length, 
past_key_values_length], dtype=mask.dtype), - mask - ], - axis=-1) - - expanded_mask = mask.unsqueeze(0).expand( - [batch_size, 1, target_length, target_length + past_key_values_length]) + mask = paddle.concat([paddle.zeros([target_length, past_key_values_length], dtype=mask.dtype), mask], axis=-1) + + expanded_mask = mask.unsqueeze(0).expand([batch_size, 1, target_length, target_length + past_key_values_length]) return expanded_mask @@ -95,9 +84,8 @@ def _expand_mask(mask, tgt_length): expanded_mask = ~(paddle.cast(mask[:, None, None, :], "bool")) expanded_mask = paddle.cast(expanded_mask, dtype=paddle.float32) - expanded_mask = expanded_mask.expand( - [batch_size, 1, tgt_length, src_length]) - expanded_mask = expanded_mask * float(finfo('float16').min) + expanded_mask = expanded_mask.expand([batch_size, 1, tgt_length, src_length]) + expanded_mask = expanded_mask * float(finfo("float16").min) return expanded_mask @@ -113,9 +101,10 @@ class MultiHeadAttention(nn.Layer): StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__( - self, - config: OPTConfig, - need_weights=False, ): + self, + config: OPTConfig, + need_weights=False, + ): super(MultiHeadAttention, self).__init__() self.use_flash_attn = config.get("use_flash_attn", False) self.num_heads = config.num_attention_heads @@ -131,8 +120,8 @@ def __init__( self.mp_degree = config.mp_degree assert ( - self.head_dim * self.num_heads * config.mp_degree == - config.hidden_size), "hidden_size must be divisible by num_heads" + self.head_dim * self.num_heads * config.mp_degree == config.hidden_size + ), "hidden_size must be divisible by num_heads" if config.mp_degree > 1: if self.fuse_attention_qkv: @@ -140,33 +129,34 @@ def __init__( config.hidden_size, config.hidden_size * 3, has_bias=True, - input_is_parallel=True, ) + input_is_parallel=True, + ) else: self.q_proj = fleet.meta_parallel.ColumnParallelLinear( config.hidden_size, config.hidden_size, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( config.hidden_size, config.hidden_size, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( config.hidden_size, config.hidden_size, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.out_proj = fleet.meta_parallel.RowParallelLinear( - config.hidden_size, - config.hidden_size, - input_is_parallel=True, - has_bias=True) + config.hidden_size, config.hidden_size, input_is_parallel=True, has_bias=True + ) else: if self.fuse_attention_qkv: - self.qkv_proj = nn.Linear(config.hidden_size, - 3 * config.hidden_size) + self.qkv_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size) else: self.q_proj = nn.Linear(config.hidden_size, config.hidden_size) self.k_proj = nn.Linear(config.hidden_size, config.hidden_size) @@ -176,15 +166,13 @@ def __init__( def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): mix_layer = self.qkv_proj(query) - mix_layer = paddle.reshape_(mix_layer, - [0, 0, self.num_heads, 3 * self.head_dim]) + mix_layer = paddle.reshape_(mix_layer, [0, 0, self.num_heads, 3 * self.head_dim]) if not self.use_flash_attn: mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) + q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) - assert not isinstance( - cache, self.StaticCache - ), "cache currently does not support the StaticCache type" + assert not isinstance(cache, self.StaticCache), "cache currently does not support the StaticCache type" if 
isinstance(cache, self.Cache): # for decoder self-attention in inference @@ -262,29 +250,19 @@ def gen_cache(self, key, value=None, type=Cache): return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( - input=key, - shape=[-1, self.num_heads, 0, self.head_dim], - dtype=key.dtype, - value=0) + input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0 + ) v = layers.fill_constant_batch_size_like( - input=key, - shape=[-1, self.num_heads, 0, self.head_dim], - dtype=key.dtype, - value=0) + input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0 + ) return self.Cache(k, v) else: # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) - def forward(self, - query, - key, - value, - attn_mask=None, - use_cache=False, - cache=None, - output_attention=None, - is_causal=True): + def forward( + self, query, key, value, attn_mask=None, use_cache=False, cache=None, output_attention=None, is_causal=True + ): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. @@ -295,8 +273,7 @@ def forward(self, if self.fuse_attention_qkv: q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: - q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, - cache) + q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) if self.use_flash_attn: bsz, q_len, num_heads, head_dim = q.shape out, weights = flash_attention( @@ -305,12 +282,12 @@ def forward(self, v, causal=is_causal and q.shape[1] != 1, return_softmax=self.need_weights and output_attention, - dropout=self.dropout) + dropout=self.dropout, + ) out = out.reshape([bsz, q_len, head_dim * num_heads]) # scale dot product attention else: - product = paddle.matmul( - x=q * (self.head_dim**-0.5), y=k, transpose_y=True) + product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: product = product + attn_mask @@ -319,24 +296,15 @@ def forward(self, if self.dropout: if self.mp_degree > 1: with get_rng_state_tracker().rng_state("local_seed"): - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") else: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) - out = tensor.reshape( - x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) @@ -364,8 +332,7 @@ def __init__(self, config): act_dropout = config.hidden_dropout_prob normalize_before = getattr(config, "normalize_before", True) - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=config.initializer_range)) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range)) bias_attr = None self._config = locals() @@ -383,18 +350,16 @@ def __init__(self, config): self.self_attn = MultiHeadAttention(config, need_weights=True) if config.mp_degree > 1: self.linear1 = fleet.meta_parallel.ColumnParallelLinear( - d_model, dim_feedforward, has_bias=True, gather_output=True) + d_model, dim_feedforward, 
has_bias=True, gather_output=True + ) else: - self.linear1 = nn.Linear( - d_model, - dim_feedforward, - weight_attrs[2], - bias_attr=bias_attrs[2]) + self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) if config.mp_degree > 1: self.linear2 = fleet.meta_parallel.ColumnParallelLinear( - dim_feedforward, d_model, has_bias=True, gather_output=True) + dim_feedforward, d_model, has_bias=True, gather_output=True + ) """ self.linear2 = fleet.meta_parallel.RowParallelLinear( dim_feedforward, @@ -404,11 +369,7 @@ def __init__(self, config): ) """ else: - self.linear2 = nn.Linear( - dim_feedforward, - d_model, - weight_attrs[2], - bias_attr=bias_attrs[2]) + self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) @@ -421,13 +382,7 @@ def __init__(self, config): self.activation = getattr(F, activation) self.mp_degree = config.mp_degree - def forward(self, - tgt, - memory, - tgt_mask=None, - use_cache=False, - cache=None, - output_attentions=False): + def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None, output_attentions=False): residual = tgt if self.normalize_before: @@ -435,17 +390,9 @@ def forward(self, # self.self_attn(...) --> hidden_states, weights, (cache) if use_cache is False: - tgt, attn_weights = self.self_attn( - tgt, - tgt, - tgt, - tgt_mask, - use_cache, - cache, - output_attention=None) + tgt, attn_weights = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache, output_attention=None) else: - tgt, attn_weights, incremental_cache = self.self_attn( - tgt, tgt, tgt, tgt_mask, use_cache, cache) + tgt, attn_weights, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) if self.mp_degree > 1: with get_rng_state_tracker().rng_state("global_seed"): tgt = residual + self.dropout1(tgt) @@ -459,11 +406,9 @@ def forward(self, tgt = self.norm2(tgt) if self.mp_degree > 1: with get_rng_state_tracker().rng_state("global_seed"): - tgt = self.dropout2( - self.linear2(self.activation(self.linear1(tgt)))) + tgt = self.dropout2(self.linear2(self.activation(self.linear1(tgt)))) else: - tgt = self.dropout2( - self.linear2(self.activation(self.linear1(tgt)))) + tgt = self.dropout2(self.linear2(self.activation(self.linear1(tgt)))) tgt = residual + tgt if not self.normalize_before: @@ -472,15 +417,12 @@ def forward(self, if not (output_attentions or use_cache): return tgt - temp_list = [ - tgt, attn_weights, incremental_cache if use_cache else None - ] + temp_list = [tgt, attn_weights, incremental_cache if use_cache else None] return tuple(v for v in temp_list if v is not None) def gen_cache(self, memory): - incremental_cache = self.self_attn.gen_cache( - memory, type=self.self_attn.Cache) + incremental_cache = self.self_attn.gen_cache(memory, type=self.self_attn.Cache) return incremental_cache @@ -498,18 +440,15 @@ def __init__(self, config: OPTConfig, decoder_layers: List[Layer]): config.hidden_size, config.word_embed_proj_dim, gather_output=True, - has_bias=False, ) + has_bias=False, + ) else: if config.use_fusedlinear: self.project_out = paddle.incubate.nn.FusedLinear( - config.hidden_size, - config.word_embed_proj_dim, - bias_attr=False) + config.hidden_size, config.word_embed_proj_dim, bias_attr=False + ) else: - self.project_out = nn.Linear( - config.hidden_size, - config.word_embed_proj_dim, - bias_attr=False) + self.project_out = nn.Linear(config.hidden_size, 
config.word_embed_proj_dim, bias_attr=False) else: self.project_out = None @@ -524,16 +463,17 @@ def __init__(self, config: OPTConfig, decoder_layers: List[Layer]): self.checkpoints = [] def forward( - self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - use_cache: bool=False, - cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=False, ): + self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + use_cache: bool = False, + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): r""" Applies a stack of N Transformer decoder layers on inputs. If `norm` is provided, also applies layer normalization on the output of last decoder @@ -552,18 +492,19 @@ def forward( tgt_mask=tgt_mask, use_cache=use_cache, cache=cache[i] if cache is not None else cache, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) # outputs = hidden_states if both use_cache and output_attentions are False # Otherwise, outputs = (hidden_states, attention if output_attentions, cache if use_cache) output = outputs[0] if (use_cache or output_attentions) else outputs if output_attentions: - all_self_attentions = all_self_attentions + (outputs[1], ) + all_self_attentions = all_self_attentions + (outputs[1],) if use_cache: new_caches.append(outputs[-1]) if output_hidden_states: - all_hidden_states = all_hidden_states + (output, ) + all_hidden_states = all_hidden_states + (output,) self.checkpoints.append(output.name) if self.final_layer_norm: @@ -573,9 +514,7 @@ def forward( output = self.project_out(output) if not return_dict: - temp_list = [ - output, new_caches, all_hidden_states, all_self_attentions - ] + temp_list = [output, new_caches, all_hidden_states, all_self_attentions] if not (use_cache or output_attentions or output_hidden_states): return output @@ -587,7 +526,8 @@ def forward( past_key_values=new_caches, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=None, ) + cross_attentions=None, + ) def gen_cache(self, memory, do_zip=False): r""" @@ -606,10 +546,7 @@ def gen_cache(self, memory, do_zip=False): class OPTLearnedPositionEmbedding(nn.Embedding): """this module learns positional embeddings up to a fixed maximum size""" - def __init__(self, - num_embeddings: int, - embedding_dim: int, - initializer_range: float): + def __init__(self, num_embeddings: int, embedding_dim: int, initializer_range: float): """OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 and adjust num_embeddings appropriately. Other models don't have this hack.
@@ -620,7 +557,7 @@ def __init__(self, self.offset = 2 super().__init__(num_embeddings + self.offset, embedding_dim) - def forward(self, attention_mask, past_key_values_length: int=0): + def forward(self, attention_mask, past_key_values_length: int = 0): """get the position embedding with attention mask Args: @@ -634,8 +571,7 @@ def forward(self, attention_mask, past_key_values_length: int=0): if attention_mask.dtype not in [paddle.bool, paddle.int64]: attention_mask = attention_mask == 1.0 - position_ids = paddle.cumsum( - paddle.cast(attention_mask, "int64"), axis=-1) * attention_mask - 1 + position_ids = paddle.cumsum(paddle.cast(attention_mask, "int64"), axis=-1) * attention_mask - 1 # cut positions if `past_key_values_length` is > 0 position_ids = position_ids[:, past_key_values_length:] @@ -653,15 +589,19 @@ def __init__(self, config: OPTConfig): self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( config.vocab_size, config.word_embed_proj_dim, - weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=config.initializer_range)), ) + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) else: self.word_embeddings = nn.Embedding( config.vocab_size, config.word_embed_proj_dim, # padding_idx=config.pad_token_id, - weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=config.initializer_range)), ) + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) if config.word_embed_proj_dim != config.hidden_size: if config.mp_degree > 1: @@ -669,41 +609,34 @@ def __init__(self, config: OPTConfig): config.word_embed_proj_dim, config.hidden_size, gather_output=True, - has_bias=False, ) + has_bias=False, + ) else: if config.use_fusedlinear: self.project_in = paddle.incubate.nn.FusedLinear( - config.word_embed_proj_dim, - config.hidden_size, - bias_attr=False) + config.word_embed_proj_dim, config.hidden_size, bias_attr=False + ) else: - self.project_in = nn.Linear( - config.word_embed_proj_dim, - config.hidden_size, - bias_attr=False) + self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias_attr=False) else: self.project_in = None self.position_embeddings = OPTLearnedPositionEmbedding( num_embeddings=config.max_position_embeddings, embedding_dim=config.hidden_size, - initializer_range=config.initializer_range, ) + initializer_range=config.initializer_range, + ) self.mp_degree = config.mp_degree self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, - input_ids=None, - attention_mask=None, - input_embeddings=None, - past_key_values_length=None): + def forward(self, input_ids=None, attention_mask=None, input_embeddings=None, past_key_values_length=None): if input_ids is not None: input_embeddings = self.word_embeddings(input_ids) if self.project_in: input_embeddings = self.project_in(input_embeddings) - position_embeddings = self.position_embeddings(attention_mask, - past_key_values_length) + position_embeddings = self.position_embeddings(attention_mask, past_key_values_length) embeddings = input_embeddings + position_embeddings if self.mp_degree > 1: @@ -738,37 +671,32 @@ def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): is_split=is_split, tensor_parallel_degree=config.tensor_parallel_degree, tensor_parallel_rank=config.tensor_parallel_rank, - num_attention_heads=config.num_attention_heads, ) - actions = {"word_embeddings.weight": partial(fn, 
is_column=False), } + num_attention_heads=config.num_attention_heads, + ) + actions = { + "word_embeddings.weight": partial(fn, is_column=False), + } for layer_index in range(config.num_hidden_layers): - actions.update({ - # Column Linear - f"decoder.layers.{layer_index}.self_attn.q_proj.weight": - partial( - fn, is_column=True), - f"decoder.layers.{layer_index}.self_attn.k_proj.weight": - partial( - fn, is_column=True), - f"decoder.layers.{layer_index}.self_attn.v_proj.weight": - partial( - fn, is_column=True), - f"decoder.layers.{layer_index}.linear1.weight": partial( - fn, is_column=True), - # Row Linear - f"decoder.layers.{layer_index}.linear2.weight": partial( - fn, is_column=False), - f"decoder.layers.{layer_index}.self_attn.out_proj.weight": - partial( - fn, is_column=False), - }) + actions.update( + { + # Column Linear + f"decoder.layers.{layer_index}.self_attn.q_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.k_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.v_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.linear1.weight": partial(fn, is_column=True), + # Row Linear + f"decoder.layers.{layer_index}.linear2.weight": partial(fn, is_column=False), + f"decoder.layers.{layer_index}.self_attn.out_proj.weight": partial(fn, is_column=False), + } + ) if config.word_embed_proj_dim != config.hidden_size: - actions.update({ - "decoder.project_out.weight": partial( - fn, is_column=True), - "decoder.project_in.weight": partial( - fn, is_column=True), - }) + actions.update( + { + "decoder.project_out.weight": partial(fn, is_column=True), + "decoder.project_in.weight": partial(fn, is_column=True), + } + ) if cls.__name__ != "OPTModel": for key in list(actions.keys()): @@ -777,22 +705,12 @@ def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): return actions @classmethod - def _get_name_mappings(cls, - config: OPTConfig) -> list[StateDictNameMapping]: + def _get_name_mappings(cls, config: OPTConfig) -> list[StateDictNameMapping]: mappings: list[StateDictNameMapping] = [] model_mappings = [ - [ - "decoder.embed_tokens.weight", - "embeddings.word_embeddings.weight" - ], - [ - "decoder.embed_positions.weight", - "embeddings.position_embeddings.weight" - ], - [ - "decoder.final_layer_norm.weight", - "decoder.final_layer_norm.weight" - ], + ["decoder.embed_tokens.weight", "embeddings.word_embeddings.weight"], + ["decoder.embed_positions.weight", "embeddings.position_embeddings.weight"], + ["decoder.final_layer_norm.weight", "decoder.final_layer_norm.weight"], ["decoder.final_layer_norm.bias", "decoder.final_layer_norm.bias"], ] for layer_index in range(config.num_hidden_layers): @@ -846,27 +764,18 @@ def _get_name_mappings(cls, f"decoder.layers.{layer_index}.linear1.weight", "transpose", ], - [ - f"decoder.layers.{layer_index}.fc1.bias", - f"decoder.layers.{layer_index}.linear1.bias" - ], + [f"decoder.layers.{layer_index}.fc1.bias", f"decoder.layers.{layer_index}.linear1.bias"], [ f"decoder.layers.{layer_index}.fc2.weight", f"decoder.layers.{layer_index}.linear2.weight", "transpose", ], - [ - f"decoder.layers.{layer_index}.fc2.bias", - f"decoder.layers.{layer_index}.linear2.bias" - ], + [f"decoder.layers.{layer_index}.fc2.bias", f"decoder.layers.{layer_index}.linear2.bias"], [ f"decoder.layers.{layer_index}.final_layer_norm.weight", f"decoder.layers.{layer_index}.norm2.weight", ], - [ - f"decoder.layers.{layer_index}.final_layer_norm.bias", - 
f"decoder.layers.{layer_index}.norm2.bias" - ], + [f"decoder.layers.{layer_index}.final_layer_norm.bias", f"decoder.layers.{layer_index}.norm2.bias"], ] model_mappings.extend(layer_mappings) @@ -877,17 +786,12 @@ def _get_name_mappings(cls, mapping[1] = "opt." + mapping[1] # downstream mappings - mappings = [ - StateDictNameMapping( - *mapping, index=index) - for index, mapping in enumerate(model_mappings) - ] + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] return mappings def _init_weights(self, layer): """Initialization hook""" - if isinstance(layer, (paddle.incubate.nn.FusedLinear, nn.Linear, - nn.Embedding)): + if isinstance(layer, (paddle.incubate.nn.FusedLinear, nn.Linear, nn.Embedding)): # In the dygraph mode, use the `set_value` to reset the parameter directly, # and reset the `state_dict` to update parameter in static mode. if isinstance(layer.weight, paddle.Tensor): @@ -895,9 +799,11 @@ def _init_weights(self, layer): paddle.tensor.normal( mean=0.0, std=self.initializer_range - if hasattr(self, "initializer_range") else - self.opt.config["initializer_range"], - shape=layer.weight.shape, )) + if hasattr(self, "initializer_range") + else self.opt.config["initializer_range"], + shape=layer.weight.shape, + ) + ) @register_base_model @@ -932,38 +838,36 @@ def __init__(self, config: OPTConfig): self.decoder = TransformerDecoder(config, decoder_layers) self.checkpoints = [] - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, - past_key_values_length): + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, past_key_values_length): # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] combined_attention_mask = None if input_shape[-1] > 1: combined_attention_mask = _make_causal_mask( - input_shape, - past_key_values_length=past_key_values_length, - dtype=attention_mask.dtype) + input_shape, past_key_values_length=past_key_values_length, dtype=attention_mask.dtype + ) if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask( - attention_mask, tgt_length=input_shape[-1]) + expanded_attn_mask = _expand_mask(attention_mask, tgt_length=input_shape[-1]) combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else - expanded_attn_mask + combined_attention_mask) + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) return combined_attention_mask def forward( - self, - input_ids=None, - position_ids=None, - attention_mask=None, - inputs_embeds=None, - use_cache=False, - cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" The OPTModel forward method, overrides the `__call__()` special method. 
@@ -1031,42 +935,37 @@ def forward( logger.warning("position_ids is not required for OPTModel.") output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = paddle.shape(input_ids) input_ids = input_ids.reshape((-1, input_shape[-1])) elif inputs_embeds is not None: input_shape = paddle.shape(inputs_embeds)[:-1] else: - raise ValueError( - "You have to specify either input_ids or inputs_embeds") + raise ValueError("You have to specify either input_ids or inputs_embeds") self.checkpoints = [] - past_key_values_length = paddle.shape(cache[0].k)[ - 2] if cache is not None else 0 + past_key_values_length = paddle.shape(cache[0].k)[2] if cache is not None else 0 seq_length_with_past = input_shape[-1] + past_key_values_length if attention_mask is None: - attention_mask = paddle.ones( - (input_shape[0], seq_length_with_past), dtype=paddle.bool) + attention_mask = paddle.ones((input_shape[0], seq_length_with_past), dtype=paddle.bool) embedding_output = self.embeddings( input_ids=input_ids, attention_mask=attention_mask, input_embeddings=inputs_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, input_shape, past_key_values_length) + attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length) attention_mask.stop_gradient = True outputs = self.decoder.forward( @@ -1077,17 +976,17 @@ def forward( cache=cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) if output_hidden_states: if return_dict: - outputs.hidden_states = (embedding_output, - ) + outputs.hidden_states + outputs.hidden_states = (embedding_output,) + outputs.hidden_states else: # [last_hidden_state, caches, all_hidden_states, all_self_attentions] idx = 2 if use_cache else 1 - all_hidden_states = ((embedding_output, ) + outputs[idx], ) - outputs = outputs[:idx] + all_hidden_states + outputs[idx + 1:] + all_hidden_states = ((embedding_output,) + outputs[idx],) + outputs = outputs[:idx] + all_hidden_states + outputs[idx + 1 :] self.checkpoints.extend(self.decoder.checkpoints) return outputs @@ -1108,22 +1007,19 @@ def set_input_embeddings(self, embedding: nn.Embedding): class OPTLMHead(Layer): - def __init__(self, - hidden_size: int, - vocab_size: int, - embedding_weights=None): + def __init__(self, hidden_size: int, vocab_size: int, embedding_weights=None): super(OPTLMHead, self).__init__() - self.decoder_weight = (self.create_parameter( - shape=[vocab_size, hidden_size], - dtype=paddle.get_default_dtype(), - is_bias=True) if embedding_weights is None else embedding_weights) + self.decoder_weight = ( + self.create_parameter(shape=[vocab_size, hidden_size], dtype=paddle.get_default_dtype(), is_bias=True) + if
embedding_weights is None + else embedding_weights + ) def forward(self, hidden_states): if isinstance(hidden_states, BaseModelOutputWithPastAndCrossAttentions): hidden_states = hidden_states["last_hidden_state"] - logits = paddle.tensor.matmul( - hidden_states, self.decoder_weight, transpose_y=True) + logits = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) return logits @@ -1139,9 +1035,10 @@ class OPTForCausalLM(OPTPretrainedModel): def __init__(self, config: OPTConfig, **kwargs): super(OPTForCausalLM, self).__init__(config) - from paddle.distributed import fleet + config.use_fusedlinear = config.get("use_fusedlinear", False) config.mp_degree = config.mp_degree + self.opt = OPTModel(config) self.lm_head = OPTLMHead( hidden_size=self.opt.config.hidden_size, @@ -1150,17 +1047,18 @@ def __init__(self, config: OPTConfig, **kwargs): ) def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - labels=None, - use_cache=False, - cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - **kwargs, ): + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): r""" Args: @@ -1207,9 +1105,10 @@ def forward( print(tokenizer.batch_decode(output_ids[0])) """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.opt( @@ -1220,7 +1119,8 @@ def forward( cache=cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) if use_cache: encoder_outputs, cached_kvs = outputs[:2] @@ -1231,12 +1131,13 @@ def forward( loss = None if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] shift_logits = logits[:, :-1, :] shift_labels = labels[:, 1:] - loss_fct = CrossEntropyLoss(reduction='mean', label_smoothing=None) - labels = shift_labels.reshape((-1, )) + loss_fct = CrossEntropyLoss(reduction="mean", label_smoothing=None) + labels = shift_labels.reshape((-1,)) + valid_index = paddle.where(labels != -100)[0].flatten() logits = shift_logits.reshape((-1, shift_logits.shape[-1])) logits = paddle.gather(logits, valid_index, axis=0) @@ -1249,8 +1150,8 @@ def forward( if not use_cache: return (loss, logits) if loss is not None else logits - outputs = (logits, ) + outputs[1:] - return ((loss, ) + outputs) if loss is not None else outputs + outputs = (logits,) + outputs[1:] + return ((loss,) + outputs) if loss is not None else outputs return CausalLMOutputWithCrossAttentions( loss=loss, @@ -1258,7 +1159,8 @@ def forward( past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, ) + cross_attentions=outputs.cross_attentions, + ) def prepare_fast_entry(self, kwargs: Dict[str, Any]): # import FasterOPT at here to avoid cycling import @@ -1270,37 +1172,27 @@ def prepare_fast_entry(self, kwargs: Dict[str, Any]): decoding_lib = kwargs.get("decoding_lib", None) if 
decode_strategy == "beam_search": - raise AttributeError( - "'beam_search' is not supported yet in the fast version of OPT") + raise AttributeError("'beam_search' is not supported yet in the fast version of OPT") # Currently, FasterTransformer only support restricted size_per_head. - size_per_head = self.opt.config["hidden_size"] // self.opt.config[ - "num_attention_heads"] + + size_per_head = self.opt.config["hidden_size"] // self.opt.config["num_attention_heads"] + if size_per_head not in [32, 64, 80, 96, 128]: raise AttributeError( - "'size_per_head = %d' is not supported yet in the fast version of OPT" - % size_per_head) + "'size_per_head = %d' is not supported yet in the fast version of OPT" % size_per_head + ) if kwargs["forced_bos_token_id"] is not None: # not support for forced_bos_token_id yet in the fast version - raise AttributeError( - "'forced_bos_token_id != None' is not supported yet in the fast version" - ) + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") if kwargs["min_length"] != 0: # not support for min_length yet in the fast version - raise AttributeError( - "'min_length != 0' is not supported yet in the fast version") - self._fast_entry = FasterOPT( - self, - use_fp16_decoding=use_fp16_decoding, - decoding_lib=decoding_lib).forward + raise AttributeError("'min_length != 0' is not supported yet in the fast version") + self._fast_entry = FasterOPT(self, use_fp16_decoding=use_fp16_decoding, decoding_lib=decoding_lib).forward return self._fast_entry - def prepare_inputs_for_generation(self, - input_ids, - use_cache=False, - cache=None, - attention_mask=None, - inputs_embeds=None, - **kwargs): + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, cache=None, attention_mask=None, inputs_embeds=None, **kwargs + ): if cache is not None: input_ids = input_ids[:, -1:] @@ -1310,20 +1202,23 @@ def prepare_inputs_for_generation(self, else: model_inputs = {"input_ids": input_ids} - model_inputs.update({ - "cache": cache, - "use_cache": True, - "attention_mask": attention_mask, - }) + model_inputs.update( + { + "cache": cache, + "use_cache": True, + "attention_mask": attention_mask, + } + ) return model_inputs @staticmethod - def prepare_attention_mask_for_generation(input_ids, pad_token_id, - eos_token_id): + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( - input_ids == pad_token_id).numpy().item() + input_ids == pad_token_id + ).numpy().item() is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( - (eos_token_id is not None) and (pad_token_id != eos_token_id)) + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: attention_mask = (input_ids != pad_token_id).astype("int64") else: @@ -1348,7 +1243,7 @@ class CrossEntropyLoss(nn.Layer): Softmax Cross entropy loss """ - def __init__(self, reduction='mean', label_smoothing=None): + def __init__(self, reduction="mean", label_smoothing=None): super().__init__() if label_smoothing is not None: assert label_smoothing >= 0 and label_smoothing <= 1, "label_smoothing must be in [0, 1]" @@ -1378,12 +1273,12 @@ def forward(self, x, label): loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: - label = paddle.cast(label, 'int64') + label = paddle.cast(label, "int64") loss = F.cross_entropy(x, label=label, soft_label=False) - if 
self.reduction == 'sum': + if self.reduction == "sum": return loss.sum() - elif self.reduction == 'mean': + elif self.reduction == "mean": return loss.mean() else: return loss diff --git a/paddlemix/models/blip2/modeling_utils.py b/paddlemix/models/blip2/modeling_utils.py index 35f9105ae3402..dad18fac98637 100644 --- a/paddlemix/models/blip2/modeling_utils.py +++ b/paddlemix/models/blip2/modeling_utils.py @@ -13,12 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle + import numpy as np -import paddle.nn.functional as F +import paddle import paddle.nn as nn -from paddlemix.utils.log import logger -import time +import paddle.nn.functional as F + + def disabled_train(self, mode=True): """Overwrite model.train with this function to make sure train/eval mode does not change anymore.""" @@ -35,7 +36,7 @@ def concat_all_gather(tensor): return tensor tensors_gather = [] - paddle.distributed.all_gather(tensors_gather, tensor,sync_op=False) + paddle.distributed.all_gather(tensors_gather, tensor, sync_op=False) output = paddle.concat(tensors_gather, axis=0) return output @@ -47,8 +48,7 @@ def tile(x, dim, n_tile): repeat_idx[dim] = n_tile x = x.repeat(*(repeat_idx)) order_index = paddle.to_tensor( - np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]), - dtype='int64' + np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]), dtype="int64" ) return paddle.index_select(x, dim, order_index) @@ -67,12 +67,13 @@ def all_gather_with_grad(tensors): tensor_all = GatherLayer.apply(tensors) return paddle.concat(tensor_all, axis=0) + class CrossEntropyLoss(nn.Layer): """ Softmax Cross entropy loss """ - def __init__(self, reduction='mean', label_smoothing=None): + def __init__(self, reduction="mean", label_smoothing=None): super().__init__() if label_smoothing is not None: assert label_smoothing >= 0 and label_smoothing <= 1, "label_smoothing must be in [0, 1]" @@ -102,16 +103,17 @@ def forward(self, x, label): loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) else: if label.dtype == paddle.int32: - label = paddle.cast(label, 'int64') + label = paddle.cast(label, "int64") loss = F.cross_entropy(x, label=label, soft_label=False) - if self.reduction == 'sum': + if self.reduction == "sum": return loss.sum() - elif self.reduction == 'mean': + elif self.reduction == "mean": return loss.mean() else: return loss + class GatherLayer(paddle.autograd.PyLayer): """ Gather tensors from all workers with support for backward propagation: @@ -126,11 +128,11 @@ def forward(ctx, x): @staticmethod def backward(ctx, *grads): - # print(grads) all_gradients = paddle.stack(grads) paddle.distributed.all_reduce(all_gradients) return all_gradients[paddle.distributed.get_rank()] + def masked_fill(x, mask, value): y = paddle.full(x.shape, value, x.dtype) return paddle.where(mask, y, x) diff --git a/paddlemix/models/common/distributed_utils.py b/paddlemix/models/common/distributed_utils.py index d5ab3eab20ca3..2004bd1345360 100644 --- a/paddlemix/models/common/distributed_utils.py +++ b/paddlemix/models/common/distributed_utils.py @@ -30,9 +30,7 @@ def forward(ctx, tensor, group=None): else: rank = dist.get_rank() world_size = dist.get_world_size() - tensors_gather = [ - paddle.empty_like(x=tensor) for _ in range(world_size) - ] + tensors_gather = [paddle.empty_like(x=tensor) for _ in range(world_size)] paddle.distributed.all_gather(tensors_gather, tensor, group=group) ctx.rank = rank ctx.batch_size = 
tensor.shape[0] @@ -40,8 +38,7 @@ def forward(ctx, tensor, group=None): @staticmethod def backward(ctx, grad_output): - return grad_output[ctx.batch_size * ctx.rank:ctx.batch_size * (ctx.rank - + 1)] + return grad_output[ctx.batch_size * ctx.rank : ctx.batch_size * (ctx.rank + 1)] allgather = AllGather.apply diff --git a/paddlemix/models/evaclip/eva_clip_model.py b/paddlemix/models/evaclip/eva_clip_model.py index 798ef23093732..91ce413de1be4 100644 --- a/paddlemix/models/evaclip/eva_clip_model.py +++ b/paddlemix/models/evaclip/eva_clip_model.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle -from paddlenlp.transformers.convbert.modeling import ConvBertClassificationHead + """ CLIP Model Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. @@ -37,10 +37,11 @@ class EVACLIPConfig(PretrainedConfig): model_type = "evaclip" def __init__( - self, - vision_cfg={}, - text_cfg={}, - **kwargs, ): + self, + vision_cfg={}, + text_cfg={}, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -49,48 +50,46 @@ def __init__( @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike]=None, - pretrained_vismodel_name_or_path: Union[str, os.PathLike]=None, - pretrained_textmodel_name_or_path: Union[str, os.PathLike]=None, - **kwargs, ) -> "PretrainedConfig": + cls, + pretrained_model_name_or_path: Union[str, os.PathLike] = None, + pretrained_vismodel_name_or_path: Union[str, os.PathLike] = None, + pretrained_textmodel_name_or_path: Union[str, os.PathLike] = None, + **kwargs, + ) -> "PretrainedConfig": assert pretrained_model_name_or_path is not None or ( - pretrained_vismodel_name_or_path is not None and - pretrained_textmodel_name_or_path is not None - ), (f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" + pretrained_vismodel_name_or_path is not None and pretrained_textmodel_name_or_path is not None + ), ( + f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" f"received `pretrained_model_name_or_path={pretrained_model_name_or_path}` and `pretrained_vismodel_name_or_path={pretrained_vismodel_name_or_path}`, " f"`pretrained_textmodel_name_or_path={pretrained_textmodel_name_or_path}`" - ) + ) config_dict = {} if pretrained_model_name_or_path is not None: - config_dict, kwargs = cls.get_config_dict( - pretrained_model_name_or_path, **kwargs) + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if ( + "model_type" in config_dict + and hasattr(cls, "model_type") + and config_dict["model_type"] != cls.model_type + ): logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
) if pretrained_vismodel_name_or_path is not None: - visual_config_dict, kwargs = cls.get_config_dict( - pretrained_vismodel_name_or_path, **kwargs) + visual_config_dict, kwargs = cls.get_config_dict(pretrained_vismodel_name_or_path, **kwargs) - if ("model_type" in visual_config_dict and - visual_config_dict["model_type"] != - "evavision_transformer"): + if "model_type" in visual_config_dict and visual_config_dict["model_type"] != "evavision_transformer": logger.warning( f"You are using a model of type {visual_config_dict['model_type']} to instantiate a model of type " f"evavision_transformer. This is not supported for all configurations of models and can yield errors." ) config_dict["vision_cfg"] = visual_config_dict if pretrained_textmodel_name_or_path is not None: - text_config_dict, kwargs = cls.get_config_dict( - pretrained_textmodel_name_or_path, **kwargs) + text_config_dict, kwargs = cls.get_config_dict(pretrained_textmodel_name_or_path, **kwargs) config_dict["text_cfg"] = text_config_dict - if ("model_type" in text_config_dict and - text_config_dict["model_type"] != "evatext_transformer"): + if "model_type" in text_config_dict and text_config_dict["model_type"] != "evatext_transformer": logger.warning( f"You are using a model of type {text_config_dict['model_type']} to instantiate a model of type " f"evatext_transformer. This is not supported for all configurations of models and can yield errors." @@ -111,21 +110,22 @@ class EVACLIPPretrainedModel(PretrainedModel): @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path=None, - pretrained_vismodel_name_or_path=None, - pretrained_textmodel_name_or_path=None, - from_hf_hub: bool=False, - subfolder: str=None, - *args, - **kwargs, ): + cls, + pretrained_model_name_or_path=None, + pretrained_vismodel_name_or_path=None, + pretrained_textmodel_name_or_path=None, + from_hf_hub: bool = False, + subfolder: str = None, + *args, + **kwargs, + ): assert pretrained_model_name_or_path is not None or ( - pretrained_vismodel_name_or_path is not None and - pretrained_textmodel_name_or_path is not None - ), (f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" + pretrained_vismodel_name_or_path is not None and pretrained_textmodel_name_or_path is not None + ), ( + f"Either `pretrained_model_name_or_path` or (`pretrained_vismodel_name_or_path` and `pretrained_textmodel_name_or_path`) must be set, but" f"received `pretrained_model_name_or_path={pretrained_model_name_or_path}` and `pretrained_vismodel_name_or_path={pretrained_vismodel_name_or_path}`, " f"`pretrained_textmodel_name_or_path={pretrained_textmodel_name_or_path}`" - ) + ) if pretrained_model_name_or_path is not None: return super().from_pretrained( @@ -133,7 +133,8 @@ def from_pretrained( from_hf_hub=from_hf_hub, subfolder=subfolder, *args, - **kwargs, ) + **kwargs, + ) else: config_dict = { "vision_cfg": pretrained_vismodel_name_or_path, @@ -145,22 +146,21 @@ def from_pretrained( class EVACLIP(EVACLIPPretrainedModel): def __init__( - self, - config, - disable_text=False, - local_loss=False, - gather_with_grad=False, - cache_labels=True, - data_world_rank=0, - data_world_size=1, - enable_recompute=False, ): + self, + config, + disable_text=False, + local_loss=False, + gather_with_grad=False, + cache_labels=True, + data_world_rank=0, + data_world_size=1, + enable_recompute=False, + ): super().__init__(config) if isinstance(config.vision_config, str): - self.visual = 
EVAVisionTransformer.from_pretrained( - config.vision_config) + self.visual = EVAVisionTransformer.from_pretrained(config.vision_config) if not disable_text: - self.text = EVATextTransformer.from_pretrained( - config.text_config) + self.text = EVATextTransformer.from_pretrained(config.text_config) else: vision_config = EVAVisionTransformerConfig(**config.vision_config) text_config = EVATextTransformerConfig(**config.text_config) @@ -169,15 +169,16 @@ def __init__( self.text = EVATextTransformer(text_config) init_data = paddle.ones(shape=[1]) * np.log(1 / 0.07) self.logit_scale = self.create_parameter( - shape=[1], - default_initializer=paddle.nn.initializer.Assign(init_data)) + shape=[1], default_initializer=paddle.nn.initializer.Assign(init_data) + ) self.loss = ClipLoss( local_loss=local_loss, gather_with_grad=gather_with_grad, cache_labels=cache_labels, rank=data_world_rank, - world_size=data_world_size, ) + world_size=data_world_size, + ) if enable_recompute: self.visual.set_grad_checkpointing(True) @@ -185,12 +186,9 @@ def __init__( self.text.set_grad_checkpointing(True) def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False): - self.visual.lock( - unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats) + self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats) - def lock_text_tower(self, - unlocked_layers: int=0, - freeze_layer_norm: bool=True): + def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True): self.text.lock(unlocked_layers, freeze_layer_norm) def set_grad_checkpointing(self, enable=True): @@ -205,31 +203,28 @@ def clip_scale(self): share_buffer = self.logit_scale.clip(0, math.log(100)) self.logit_scale.copy_(share_buffer, True) - def encode_image(self, image, normalize: bool=False): + def encode_image(self, image, normalize: bool = False): features = self.visual(image) - out = (paddle.nn.functional.normalize( - x=features, axis=-1) if normalize else features) + out = paddle.nn.functional.normalize(x=features, axis=-1) if normalize else features return out - def encode_text(self, text, text_features=None, normalize: bool=False): + def encode_text(self, text, text_features=None, normalize: bool = False): if text_features is not None: # directly use text_features if given - return (paddle.nn.functional.normalize( - x=text_features, axis=-1) if normalize else text_features) + return paddle.nn.functional.normalize(x=text_features, axis=-1) if normalize else text_features features = self.text(text) - return (paddle.nn.functional.normalize( - x=features, axis=-1) if normalize else features) + return paddle.nn.functional.normalize(x=features, axis=-1) if normalize else features def forward(self, image, input_ids, text_emb=None, skiploss=False): self.clip_scale() text = input_ids text_features = text_emb image_features = self.encode_image(image, normalize=True) - text_features = self.encode_text( - text, text_features=text_features, normalize=True) + text_features = self.encode_text(text, text_features=text_features, normalize=True) if skiploss: return image_features, text_features, self.logit_scale.exp() loss_itc, logits_per_image, logits_per_text, labels = self.loss( - (image_features, text_features, self.logit_scale.exp())) + (image_features, text_features, self.logit_scale.exp()) + ) return loss_itc, image_features, text_features, self.logit_scale.exp() diff --git a/paddlemix/models/evaclip/eva_text_model.py b/paddlemix/models/evaclip/eva_text_model.py index 7b314fd424cc8..77b8bc2207293 100644 --- 
a/paddlemix/models/evaclip/eva_text_model.py +++ b/paddlemix/models/evaclip/eva_text_model.py @@ -16,7 +16,7 @@ import logging import math import os -from typing import Callable, Optional, Sequence, Union +from typing import Callable, Optional, Union import paddle import paddle.distributed as dist @@ -27,7 +27,7 @@ from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from .utils import params_normal_, to_2tuple +from .utils import params_normal_ try: from .modules.fusedln import FusedLayerNorm @@ -37,7 +37,9 @@ print("Warning, FusedLn module is not available, use LayerNorm instead.") try: from paddle.incubate.nn.memory_efficient_attention import ( - LowerTriangularMask, memory_efficient_attention) + LowerTriangularMask, + memory_efficient_attention, + ) except: print("Warning: import memory_efficient_attention error") @@ -87,17 +89,18 @@ class MultiHeadAttention(paddle.nn.Layer): StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) def __init__( - self, - embed_dim, - num_heads, - dropout=0.0, - kdim=None, - vdim=None, - need_weights=False, - weight_attr=None, - bias_attr=None, - fuse_attention_qkv=False, - num_partitions=1, ): + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + weight_attr=None, + bias_attr=None, + fuse_attention_qkv=False, + num_partitions=1, + ): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -108,8 +111,7 @@ def __init__( self.fuse_attention_qkv = fuse_attention_qkv self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" assert self.num_heads % num_partitions == 0 self.num_heads = self.num_heads // num_partitions @@ -124,12 +126,14 @@ def __init__( 3 * embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) else: self.qkv_proj = paddle.nn.Linear( embed_dim, 3 * embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) else: if dist.get_world_size() > 1: self.q_proj = fleet.meta_parallel.ColumnParallelLinear( @@ -137,34 +141,40 @@ def __init__( embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( self.kdim, embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( self.vdim, embed_dim, weight_attr=weight_attr, has_bias=True, - gather_output=False, ) + gather_output=False, + ) else: self.q_proj = paddle.nn.Linear( embed_dim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) self.k_proj = paddle.nn.Linear( self.kdim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) self.v_proj = paddle.nn.Linear( self.vdim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) if dist.get_world_size() > 1: self.out_proj = fleet.meta_parallel.RowParallelLinear( @@ -172,12 +182,14 @@ def __init__( embed_dim, weight_attr=weight_attr, has_bias=True, - input_is_parallel=True, ) + input_is_parallel=True, + ) else: self.out_proj = paddle.nn.Linear( embed_dim, embed_dim, - weight_attr=weight_attr, ) + weight_attr=weight_attr, + ) def _fuse_prepare_qkv(self, query): mix_layer = self.qkv_proj(query) @@ 
-250,13 +262,7 @@ def gen_cache(self, key, value=None, type=Cache): # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) - def forward(self, - query, - key, - value, - attn_mask=None, - use_cache=False, - cache=None): + def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. @@ -270,11 +276,9 @@ def forward(self, else: q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) else: - q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, - cache) + q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) # scale dot product attention - product = paddle.matmul( - x=q * (self.head_dim**-0.5), y=k, transpose_y=True) + product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True) if attn_mask is not None: # Support bool or int mask @@ -288,7 +292,8 @@ def forward(self, weights, self.dropout, training=self.training, - mode="upscale_in_train", ) + mode="upscale_in_train", + ) out = tensor.matmul(weights, v) @@ -317,11 +322,10 @@ def forward(self, x: paddle.Tensor): output = paddle.nn.functional.layer_norm( x=x.astype(dtype="float32"), normalized_shape=self._normalized_shape, - weight=self.weight.astype(dtype="float32") - if self.weight is not None else None, - bias=self.bias.astype(dtype="float32") - if self.bias is not None else None, - epsilon=self._epsilon, ) + weight=self.weight.astype(dtype="float32") if self.weight is not None else None, + bias=self.bias.astype(dtype="float32") if self.bias is not None else None, + epsilon=self._epsilon, + ) return output.astype(dtype=x.dtype) @@ -335,14 +339,15 @@ def forward(self, x: paddle.Tensor): normalized_shape=self._normalized_shape, weight=self.weight, bias=self.bias, - epsilon=self._epsilon, ) + epsilon=self._epsilon, + ) if isinstance(orig_type, paddle.dtype): dtype = orig_type elif isinstance(orig_type, str) and orig_type not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = orig_type elif isinstance(orig_type, paddle.Tensor): @@ -361,9 +366,7 @@ class LayerScale(paddle.nn.Layer): def __init__(self, dim, init_values=1e-05): super().__init__() init_data = init_values * paddle.ones(shape=[dim]) - self.gamma = self.create_parameter( - shape=[dim], - default_initializer=paddle.nn.initializer.Assign(init_data)) + self.gamma = self.create_parameter(shape=[dim], default_initializer=paddle.nn.initializer.Assign(init_data)) def forward(self, x): return x * self.gamma @@ -409,30 +412,28 @@ def forward(self, x): def _in_projection_packed( - q: paddle.Tensor, - k: paddle.Tensor, - v: paddle.Tensor, - w: paddle.Tensor, - b: Optional[paddle.Tensor]=None, ): + q: paddle.Tensor, + k: paddle.Tensor, + v: paddle.Tensor, + w: paddle.Tensor, + b: Optional[paddle.Tensor] = None, +): """ https://github.com/pytorch/pytorch/blob/db2a237763eb8693a20788be94f8c192e762baa8/torch/nn/functional.py#L4726 """ E = q.shape[-1] if k is v: if q is k: - return paddle.nn.functional.linear( - x=q, weight=w, bias=b).chunk( - chunks=3, axis=-1) + return paddle.nn.functional.linear(x=q, weight=w, bias=b).chunk(chunks=3, axis=-1) else: w_q, w_kv = w.split([E, E * 2]) if b is None: b_q = b_kv = None else: b_q, b_kv = b.split([E, E * 2]) - return (paddle.nn.functional.linear( - x=q, weight=w_q, bias=b_q), ) + paddle.nn.functional.linear( - x=k, weight=w_kv, bias=b_kv).chunk( - chunks=2, axis=-1) + return (paddle.nn.functional.linear(x=q, 
weight=w_q, bias=b_q),) + paddle.nn.functional.linear( + x=k, weight=w_kv, bias=b_kv + ).chunk(chunks=2, axis=-1) else: w_q, w_k, w_v = w.chunk(chunks=3) if b is None: @@ -440,12 +441,10 @@ def _in_projection_packed( else: b_q, b_k, b_v = b.chunk(chunks=3) return ( - paddle.nn.functional.linear( - x=q, weight=w_q, bias=b_q), - paddle.nn.functional.linear( - x=k, weight=w_k, bias=b_k), - paddle.nn.functional.linear( - x=v, weight=w_v, bias=b_v), ) + paddle.nn.functional.linear(x=q, weight=w_q, bias=b_q), + paddle.nn.functional.linear(x=k, weight=w_k, bias=b_k), + paddle.nn.functional.linear(x=v, weight=w_v, bias=b_v), + ) def masked_fill(x, mask, value): @@ -455,17 +454,18 @@ def masked_fill(x, mask, value): class Attention(paddle.nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=True, - scaled_cosine=False, - scale_heads=False, - logit_scale_max=math.log(1.0 / 0.01), - attn_drop=0.0, - proj_drop=0.0, - xattn=False, - rope=False, ): + self, + dim, + num_heads=8, + qkv_bias=True, + scaled_cosine=False, + scale_heads=False, + logit_scale_max=math.log(1.0 / 0.01), + attn_drop=0.0, + proj_drop=0.0, + xattn=False, + rope=False, + ): super().__init__() self.scaled_cosine = scaled_cosine self.scale_heads = scale_heads @@ -482,19 +482,22 @@ def __init__( paddle.set_default_dtype(origin_dtype) self.in_proj_weight = self.create_parameter( shape=[dim, dim * 3], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if qkv_bias: init_data = paddle.zeros(shape=[dim * 3]) self.in_proj_bias = self.create_parameter( shape=[dim * 3], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.in_proj_bias = None if self.scaled_cosine: init_data = paddle.log(x=10 * paddle.ones(shape=[num_heads, 1, 1])) self.logit_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.logit_scale = None self.attn_drop = paddle.nn.Dropout(p=attn_drop) @@ -502,12 +505,14 @@ def __init__( init_data = paddle.ones(shape=[num_heads, 1, 1]) self.head_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.head_scale = None if dist.get_world_size() > 1: self.out_proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) + dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: self.out_proj = paddle.nn.Linear(dim, dim) self.out_drop = paddle.nn.Dropout(p=proj_drop) @@ -515,11 +520,11 @@ def __init__( self.xattn_drop = attn_drop self.rope = rope - def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): + def forward(self, x, attn_mask: Optional[paddle.Tensor] = None): L, N, C = x.shape - q, k, v = paddle.nn.functional.linear( - x=x, weight=self.in_proj_weight, bias=self.in_proj_bias).chunk( - chunks=3, axis=-1) + q, k, v = paddle.nn.functional.linear(x=x, weight=self.in_proj_weight, bias=self.in_proj_bias).chunk( + chunks=3, axis=-1 + ) if self.xattn: x = q.reshape((L, N, self.num_heads, -1)) perm_3 = list(range(x.ndim)) @@ -542,8 +547,8 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): v, p=self.xattn_drop, scale=self.scale if self.logit_scale is None else None, - 
attn_bias=LowerTriangularMask() - if attn_mask is not None else None, ) + attn_bias=LowerTriangularMask() if attn_mask is not None else None, + ) else: x = q.reshape((L, N * self.num_heads, -1)) perm_6 = list(range(x.ndim)) @@ -566,11 +571,10 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): perm_9[-1] = -2 perm_9[-2] = -1 attn = paddle.bmm( - x=paddle.nn.functional.normalize( - x=q, axis=-1), - y=x.transpose(perm=perm_9), ) - logit_scale = paddle.clip( - x=self.logit_scale, max=self.logit_scale_max).exp() + x=paddle.nn.functional.normalize(x=q, axis=-1), + y=x.transpose(perm=perm_9), + ) + logit_scale = paddle.clip(x=self.logit_scale, max=self.logit_scale_max).exp() attn = attn.reshape((N, self.num_heads, L, L)) * logit_scale attn = attn.reshape((-1, L, L)) else: @@ -582,11 +586,9 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): attn = paddle.bmm(x=q, y=x.transpose(perm=perm_10)) if attn_mask is not None: if attn_mask.dtype == "bool": - new_attn_mask = paddle.zeros_like( - x=attn_mask).astype(q.dtype) + new_attn_mask = paddle.zeros_like(x=attn_mask).astype(q.dtype) # new_attn_mask.masked_fill_(attn_mask, float('-inf')) - new_attn_mask = masked_fill(new_attn_mask, attn_mask, - float("-inf")) + new_attn_mask = masked_fill(new_attn_mask, attn_mask, float("-inf")) attn_mask = new_attn_mask attn += attn_mask attn = paddle.nn.functional.softmax(attn, axis=-1) @@ -609,16 +611,17 @@ def forward(self, x, attn_mask: Optional[paddle.Tensor]=None): class CustomAttention(paddle.nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=True, - scaled_cosine=True, - scale_heads=False, - logit_scale_max=math.log(1.0 / 0.01), - attn_drop=0.0, - proj_drop=0.0, - xattn=False, ): + self, + dim, + num_heads=8, + qkv_bias=True, + scaled_cosine=True, + scale_heads=False, + logit_scale_max=math.log(1.0 / 0.01), + attn_drop=0.0, + proj_drop=0.0, + xattn=False, + ): super().__init__() self.scaled_cosine = scaled_cosine self.scale_heads = scale_heads @@ -635,19 +638,21 @@ def __init__( paddle.set_default_dtype(origin_dtype) self.in_proj_weight = self.create_parameter( shape=[dim, dim * 3], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if qkv_bias: self.in_proj_bias = self.create_parameter( shape=[dim * 3], - default_initializer=paddle.nn.initializer.Assign( - paddle.zeros(shape=[dim * 3])), ) + default_initializer=paddle.nn.initializer.Assign(paddle.zeros(shape=[dim * 3])), + ) else: self.in_proj_bias = None if self.scaled_cosine: init_data = paddle.log(x=10 * paddle.ones(shape=[num_heads, 1, 1])) self.logit_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.logit_scale = None self.attn_drop = paddle.nn.Dropout(p=attn_drop) @@ -655,12 +660,14 @@ def __init__( init_data = paddle.ones(shape=[num_heads, 1, 1]) self.head_scale = self.create_parameter( shape=[num_heads, 1, 1], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.head_scale = None if dist.get_world_size() > 1: self.out_proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) + dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: self.out_proj = paddle.nn.Linear(dim, dim) self.out_drop = paddle.nn.Dropout(p=proj_drop) @@ 
-668,31 +675,28 @@ def __init__( self.xattn_drop = attn_drop def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - attn_mask: Optional[paddle.Tensor]=None, ): - q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, - self.in_proj_bias) + self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + attn_mask: Optional[paddle.Tensor] = None, + ): + q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, self.in_proj_bias) N_q, B_q, C_q = q.shape N_k, B_k, C_k = k.shape N_v, B_v, C_v = v.shape if self.xattn: - q = q.transpose(perm=[1, 0, 2]).reshape( - (B_q, N_q, self.num_heads, -1)) - k = k.transpose(perm=[1, 0, 2]).reshape( - (B_k, N_k, self.num_heads, -1)) - v = v.transpose(perm=[1, 0, 2]).reshape( - (B_v, N_v, self.num_heads, -1)) + q = q.transpose(perm=[1, 0, 2]).reshape((B_q, N_q, self.num_heads, -1)) + k = k.transpose(perm=[1, 0, 2]).reshape((B_k, N_k, self.num_heads, -1)) + v = v.transpose(perm=[1, 0, 2]).reshape((B_v, N_v, self.num_heads, -1)) x = memory_efficient_attention( q, k, v, p=self.xattn_drop, scale=self.scale if self.logit_scale is None else None, - attn_bias=LowerTriangularMask() - if attn_mask is not None else None, ) + attn_bias=LowerTriangularMask() if attn_mask is not None else None, + ) else: x = q.reshape((N_q, B_q * self.num_heads, -1)) @@ -716,13 +720,11 @@ def forward( perm_15[-1] = -2 perm_15[-2] = -1 attn = paddle.bmm( - x=paddle.nn.functional.normalize( - x=q, axis=-1), - y=x.transpose(perm=perm_15), ) - logit_scale = paddle.clip( - x=self.logit_scale, max=self.logit_scale_max).exp() - attn = attn.reshape( - (B_q, self.num_heads, N_q, N_k)) * logit_scale + x=paddle.nn.functional.normalize(x=q, axis=-1), + y=x.transpose(perm=perm_15), + ) + logit_scale = paddle.clip(x=self.logit_scale, max=self.logit_scale_max).exp() + attn = attn.reshape((B_q, self.num_heads, N_q, N_k)) * logit_scale attn = attn.reshape((-1, N_q, N_k)) else: q = q * self.scale @@ -733,10 +735,8 @@ def forward( attn = paddle.bmm(x=q, y=x.transpose(perm=perm_16)) if attn_mask is not None: if attn_mask.dtype == "bool": - new_attn_mask = paddle.zeros_like( - x=attn_mask).astype(q.dtype) - new_attn_mask = masked_fill(new_attn_mask, attn_mask, - float("-inf")) + new_attn_mask = paddle.zeros_like(x=attn_mask).astype(q.dtype) + new_attn_mask = masked_fill(new_attn_mask, attn_mask, float("-inf")) attn_mask = new_attn_mask attn += attn_mask attn = paddle.nn.functional.softmax(attn, axis=-1) @@ -759,19 +759,20 @@ def forward( class CustomResidualAttentionBlock(paddle.nn.Layer): def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float=4.0, - ls_init_value: float=None, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=LayerNorm, - scale_cosine_attn: bool=False, - scale_heads: bool=False, - scale_attn: bool=False, - scale_fc: bool=False, - cross_attn: bool=False, - xattn: bool=False, ): + self, + d_model: int, + n_head: int, + mlp_ratio: float = 4.0, + ls_init_value: float = None, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = LayerNorm, + scale_cosine_attn: bool = False, + scale_heads: bool = False, + scale_attn: bool = False, + scale_fc: bool = False, + cross_attn: bool = False, + xattn: bool = False, + ): super().__init__() self.ln_1 = norm_layer(d_model) self.ln_1_k = norm_layer(d_model) if cross_attn else self.ln_1 @@ -784,74 +785,74 @@ def __init__( proj_drop=0.0, scaled_cosine=scale_cosine_attn, scale_heads=scale_heads, - xattn=xattn, ) - self.ln_attn = norm_layer( - 
d_model) if scale_attn else paddle.nn.Identity() - self.ls_1 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else paddle.nn.Identity()) + xattn=xattn, + ) + self.ln_attn = norm_layer(d_model) if scale_attn else paddle.nn.Identity() + self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else paddle.nn.Identity() self.ln_2 = norm_layer(d_model) mlp_width = int(d_model * mlp_ratio) if dist.get_world_size() > 1: - self.mlp = paddle.nn.Sequential(* [ - ( - "c_fc", - fleet.meta_parallel.ColumnParallelLinear( - d_model, - mlp_width, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ("ln", norm_layer(mlp_width) - if scale_fc else paddle.nn.Identity()), - ("gelu", act_layer()), - ( - "c_proj", - fleet.meta_parallel.ColumnParallelLinear( - mlp_width, - d_model, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ]) + self.mlp = paddle.nn.Sequential( + *[ + ( + "c_fc", + fleet.meta_parallel.ColumnParallelLinear( + d_model, + mlp_width, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ("ln", norm_layer(mlp_width) if scale_fc else paddle.nn.Identity()), + ("gelu", act_layer()), + ( + "c_proj", + fleet.meta_parallel.ColumnParallelLinear( + mlp_width, + d_model, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ] + ) else: - self.mlp = paddle.nn.Sequential(* [ - ("c_fc", paddle.nn.Linear(d_model, mlp_width)), - ("ln", norm_layer(mlp_width) - if scale_fc else paddle.nn.Identity()), - ("gelu", act_layer()), - ("c_proj", paddle.nn.Linear(mlp_width, d_model)), - ]) - self.ls_2 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else paddle.nn.Identity()) + self.mlp = paddle.nn.Sequential( + *[ + ("c_fc", paddle.nn.Linear(d_model, mlp_width)), + ("ln", norm_layer(mlp_width) if scale_fc else paddle.nn.Identity()), + ("gelu", act_layer()), + ("c_proj", paddle.nn.Linear(mlp_width, d_model)), + ] + ) + self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else paddle.nn.Identity() def forward( - self, - q: paddle.Tensor, - k: paddle.Tensor, - v: paddle.Tensor, - attn_mask: Optional[paddle.Tensor]=None, ): - q = q + self.ls_1( - self.ln_attn( - self.attn( - self.ln_1(q), - self.ln_1_k(k), - self.ln_1_v(v), - attn_mask=attn_mask))) + self, + q: paddle.Tensor, + k: paddle.Tensor, + v: paddle.Tensor, + attn_mask: Optional[paddle.Tensor] = None, + ): + q = q + self.ls_1(self.ln_attn(self.attn(self.ln_1(q), self.ln_1_k(k), self.ln_1_v(v), attn_mask=attn_mask))) q = q + self.ls_2(self.mlp(self.ln_2(q))) return q class ResidualAttentionBlock(paddle.nn.Layer): def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float=4.0, - ls_init_value: float=None, - act_layer: Callable=nn.GELU, - norm_layer: Callable=LayerNorm, - xattn: bool=False, - is_cross_attention: bool=False, ): + self, + d_model: int, + n_head: int, + mlp_ratio: float = 4.0, + ls_init_value: float = None, + act_layer: Callable = nn.GELU, + norm_layer: Callable = LayerNorm, + xattn: bool = False, + is_cross_attention: bool = False, + ): super().__init__() self.ln_1 = norm_layer(d_model) @@ -859,57 +860,64 @@ def __init__( self.attn = Attention(d_model, n_head, xattn=True) else: self.attn = MultiHeadAttention(d_model, n_head) - self.ls_1 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else nn.Identity()) + self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() if is_cross_attention: self.ln_1_kv = norm_layer(d_model) self.ln_2 = 
norm_layer(d_model) mlp_width = int(d_model * mlp_ratio) if dist.get_world_size() > 1: - self.mlp = paddle.nn.Sequential(* [ - ( - "c_fc", - fleet.meta_parallel.ColumnParallelLinear( - d_model, - mlp_width, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ("gelu", act_layer()), - ( - "c_proj", - fleet.meta_parallel.ColumnParallelLinear( - mlp_width, - d_model, - weight_attr=None, - has_bias=True, - gather_output=True, ), ), - ]) + self.mlp = paddle.nn.Sequential( + *[ + ( + "c_fc", + fleet.meta_parallel.ColumnParallelLinear( + d_model, + mlp_width, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ("gelu", act_layer()), + ( + "c_proj", + fleet.meta_parallel.ColumnParallelLinear( + mlp_width, + d_model, + weight_attr=None, + has_bias=True, + gather_output=True, + ), + ), + ] + ) else: - self.mlp = paddle.nn.Sequential(* [ - ("c_fc", paddle.nn.Linear(d_model, mlp_width)), - ("gelu", act_layer()), - ("c_proj", paddle.nn.Linear(mlp_width, d_model)), - ]) - self.ls_2 = (LayerScale(d_model, ls_init_value) - if ls_init_value is not None else nn.Identity()) + self.mlp = paddle.nn.Sequential( + *[ + ("c_fc", paddle.nn.Linear(d_model, mlp_width)), + ("gelu", act_layer()), + ("c_proj", paddle.nn.Linear(mlp_width, d_model)), + ] + ) + self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() self.xattn = xattn def attention( - self, - q_x, - k_x=None, - v_x=None, - attn_mask=None, ): + self, + q_x, + k_x=None, + v_x=None, + attn_mask=None, + ): if isinstance(q_x.dtype, paddle.dtype): dtype = q_x.dtype elif isinstance(q_x.dtype, str) and q_x.dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = q_x.dtype elif isinstance(q_x.dtype, paddle.Tensor): @@ -920,8 +928,7 @@ def attention( if self.xattn: return self.attn(q_x, attn_mask=attn_mask) - attn_mask = (attn_mask.unsqueeze(0).unsqueeze(0) - if attn_mask is not None else None) + attn_mask = attn_mask.unsqueeze(0).unsqueeze(0) if attn_mask is not None else None q_x = q_x.transpose((1, 0, 2)) k_x = k_x if k_x is not None else q_x v_x = v_x if v_x is not None else q_x @@ -929,19 +936,16 @@ def attention( return out.transpose((1, 0, 2)) def forward( - self, - q_x, - k_x=None, - v_x=None, - attn_mask=None, ): - k_x = (self.ln_1_kv(k_x) - if hasattr(self, "ln_1_kv") and k_x is not None else None) - v_x = (self.ln_1_kv(v_x) - if hasattr(self, "ln_1_kv") and v_x is not None else None) - - x = self.ls_1( - self.attention( - q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)) + self, + q_x, + k_x=None, + v_x=None, + attn_mask=None, + ): + k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None + v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None + + x = self.ls_1(self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)) x = x + q_x x = x + self.ls_2(self.mlp(self.ln_2(x))) return x @@ -949,41 +953,44 @@ def forward( class Transformer(paddle.nn.Layer): def __init__( - self, - config, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=LayerNorm, ): + self, + config, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = LayerNorm, + ): super().__init__() self.enable_recompute = False self.width = config.width self.layers = config.layers - self.resblocks = paddle.nn.LayerList(sublayers=[ - ResidualAttentionBlock( - config.width, - config.heads, - mlp_ratio=4.0, - ls_init_value=config.ls_init_value, - act_layer=act_layer, - norm_layer=norm_layer, - 
xattn=config.xattn, ) for _ in range(config.layers) - ]) + self.resblocks = paddle.nn.LayerList( + sublayers=[ + ResidualAttentionBlock( + config.width, + config.heads, + mlp_ratio=4.0, + ls_init_value=config.ls_init_value, + act_layer=act_layer, + norm_layer=norm_layer, + xattn=config.xattn, + ) + for _ in range(config.layers) + ] + ) def get_cast_dtype(self) -> paddle.dtype: return self.resblocks[0].mlp.c_fc.weight.dtype - def forward(self, x: paddle.Tensor, - attn_mask: Optional[paddle.Tensor]=None): + def forward(self, x: paddle.Tensor, attn_mask: Optional[paddle.Tensor] = None): for r in self.resblocks: if self.enable_recompute: - x = paddle.distributed.fleet.utils.recompute( - r, x, attn_mask, use_reentrant=False) + x = paddle.distributed.fleet.utils.recompute(r, x, attn_mask, use_reentrant=False) else: x = r(x, attn_mask=attn_mask) return x class AttentionalPooler(paddle.nn.Layer): - def __init__(self, config, norm_layer: Callable=LayerNorm): + def __init__(self, config, norm_layer: Callable = LayerNorm): super().__init__() d_model = config.num_classes context_dim = config.embed_dim @@ -995,12 +1002,9 @@ def __init__(self, config, norm_layer: Callable=LayerNorm): paddle.set_default_dtype(origin_dtype) self.query = self.create_parameter( shape=[config.n_queries, d_model], - default_initializer=paddle.nn.initializer.Assign(init_data), ) - self.attn = MultiHeadAttention( - d_model, - config.attn_pooler_heads, - kdim=context_dim, - vdim=context_dim) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) + self.attn = MultiHeadAttention(d_model, config.attn_pooler_heads, kdim=context_dim, vdim=context_dim) self.ln_q = norm_layer(d_model) self.ln_k = norm_layer(context_dim) @@ -1020,21 +1024,22 @@ class EVATextTransformerConfig(PretrainedConfig): model_type = "evatext_transformer" def __init__( - self, - context_length: int=77, - vocab_size: int=49408, - width: int=512, - heads: int=8, - layers: int=12, - ls_init_value: float=None, - output_dim: int=512, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=LayerNorm, - xattn: bool=False, - attn_mask: bool=True, - pad_id: int=0, - quick_gelu: bool=False, - **kwargs, ): + self, + context_length: int = 77, + vocab_size: int = 49408, + width: int = 512, + heads: int = 8, + layers: int = 12, + ls_init_value: float = None, + output_dim: int = 512, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = LayerNorm, + xattn: bool = False, + attn_mask: bool = True, + pad_id: int = 0, + quick_gelu: bool = False, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -1053,14 +1058,10 @@ def __init__( self.quick_gelu = quick_gelu @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) - - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
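The attention forward passes reformatted in the hunks above all share one pattern when a boolean attn_mask is supplied: the mask is first expanded into an additive float tensor whose True positions hold -inf, and that tensor is added to the raw attention logits before softmax, which drives the masked positions to zero probability. Below is a minimal Paddle sketch of that conversion, for illustration only; it uses paddle.where as a stand-in for the repository's masked_fill helper, which is referenced in the hunks but not defined in them, and the function name is hypothetical.

import paddle


def bool_mask_to_additive(attn_mask: paddle.Tensor, dtype: str = "float32") -> paddle.Tensor:
    # Mirror of the pattern in the hunks above: True positions become -inf,
    # False positions 0.0, so that `attn += additive_mask` suppresses the
    # masked logits before softmax.
    zeros = paddle.zeros_like(attn_mask).astype(dtype)
    return paddle.where(attn_mask, paddle.full_like(zeros, float("-inf")), zeros)


mask = paddle.to_tensor([[False, True], [False, False]])    # True = position to hide
attn = paddle.zeros([2, 2]) + bool_mask_to_additive(mask)   # raw logits + additive mask
probs = paddle.nn.functional.softmax(attn, axis=-1)         # masked column gets ~0 weight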
@@ -1096,54 +1097,43 @@ def __init__(self, config: EVATextTransformerConfig): self.num_pos = config.context_length self.heads = config.heads if dist.get_world_size() > 1: - self.token_embedding = fleet.meta_parallel.VocabParallelEmbedding( - config.vocab_size, width) + self.token_embedding = fleet.meta_parallel.VocabParallelEmbedding(config.vocab_size, width) else: self.token_embedding = paddle.nn.Embedding(config.vocab_size, width) - self.transformer = Transformer( - config, act_layer=act_layer, norm_layer=norm_layer) + self.transformer = Transformer(config, act_layer=act_layer, norm_layer=norm_layer) self.ln_final = norm_layer(width) init_data = paddle.empty(shape=[width, self.output_dim]) self.text_projection = self.create_parameter( shape=[width, self.output_dim], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) init_data = paddle.empty(shape=[self.num_pos, width]) self.positional_embedding = self.create_parameter( shape=[self.num_pos, width], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if config.attn_mask: - self.register_buffer( - "attn_mask", self.build_attention_mask(), persistable=False) + self.register_buffer("attn_mask", self.build_attention_mask(), persistable=False) else: self.attn_mask = None # self.init_parameters() def init_parameters(self): - self.token_embedding.weight = params_normal_( - self.token_embedding.weight, std=0.02) - self.positional_embedding = params_normal_( - self.positional_embedding, std=0.01) + self.token_embedding.weight = params_normal_(self.token_embedding.weight, std=0.02) + self.positional_embedding = params_normal_(self.positional_embedding, std=0.01) - proj_std = (self.transformer.width**-0.5 * (2 * self.transformer.layers) - **-0.5) + proj_std = self.transformer.width**-0.5 * (2 * self.transformer.layers) ** -0.5 attn_std = self.transformer.width**-0.5 - fc_std = (2 * self.transformer.width)**-0.5 + fc_std = (2 * self.transformer.width) ** -0.5 for block in self.transformer.resblocks: - block.attn.q_proj.weight = params_normal_( - block.attn.q_proj.weight, std=attn_std) - block.attn.k_proj.weight = params_normal_( - block.attn.k_proj.weight, std=attn_std) - block.attn.v_proj.weight = params_normal_( - block.attn.v_proj.weight, std=attn_std) - block.attn.out_proj.weight = params_normal_( - block.attn.out_proj.weight, std=proj_std) - block.mlp.c_fc.weight = params_normal_( - block.mlp.c_fc.weight, std=fc_std) - block.mlp.c_proj.weight = params_normal_( - block.mlp.c_proj.weight, std=proj_std) + block.attn.q_proj.weight = params_normal_(block.attn.q_proj.weight, std=attn_std) + block.attn.k_proj.weight = params_normal_(block.attn.k_proj.weight, std=attn_std) + block.attn.v_proj.weight = params_normal_(block.attn.v_proj.weight, std=attn_std) + block.attn.out_proj.weight = params_normal_(block.attn.out_proj.weight, std=proj_std) + block.mlp.c_fc.weight = params_normal_(block.mlp.c_fc.weight, std=fc_std) + block.mlp.c_proj.weight = params_normal_(block.mlp.c_proj.weight, std=proj_std) if self.text_projection is not None: - self.text_projection = params_normal_( - self.text_projection, std=self.transformer.width**-0.5) + self.text_projection = params_normal_(self.text_projection, std=self.transformer.width**-0.5) def set_grad_checkpointing(self, enable=True): self.transformer.enable_recompute = enable @@ -1168,10 +1158,10 @@ def forward(self, text): if isinstance(cast_dtype, 
paddle.dtype): dtype = cast_dtype elif isinstance(cast_dtype, str) and cast_dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = cast_dtype elif isinstance(cast_dtype, paddle.Tensor): @@ -1184,10 +1174,10 @@ def forward(self, text): if isinstance(cast_dtype, paddle.dtype): dtype = cast_dtype elif isinstance(cast_dtype, str) and cast_dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = cast_dtype elif isinstance(cast_dtype, paddle.Tensor): @@ -1202,6 +1192,6 @@ def forward(self, text): pooled = x[paddle.arange(x.shape[0]), text.argmax(axis=-1)] if self.text_projection is not None: - pooled = pooled @self.text_projection + pooled = pooled @ self.text_projection return pooled diff --git a/paddlemix/models/evaclip/eva_vit_model.py b/paddlemix/models/evaclip/eva_vit_model.py index f829400ad921e..df2195370705e 100644 --- a/paddlemix/models/evaclip/eva_vit_model.py +++ b/paddlemix/models/evaclip/eva_vit_model.py @@ -31,8 +31,7 @@ import paddle.distributed as dist from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention +from paddle.incubate.nn.memory_efficient_attention import memory_efficient_attention from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlenlp.transformers.model_utils import PretrainedModel from paddlenlp.utils.log import logger @@ -40,10 +39,7 @@ from .utils import to_2tuple, trunc_normal_ -def drop_path(x, - drop_prob: float=0.0, - training: bool=False, - scale_by_keep: bool=True): +def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, @@ -56,17 +52,14 @@ def drop_path(x, if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob - shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) - bern_0 = (paddle.to_tensor( - [keep_prob], dtype=paddle.float32) - if not isinstance(keep_prob, paddle.Tensor) else keep_prob) + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + bern_0 = ( + paddle.to_tensor([keep_prob], dtype=paddle.float32) if not isinstance(keep_prob, paddle.Tensor) else keep_prob + ) random_tensor = paddle.assign( - paddle.bernoulli( - paddle.broadcast_to( - bern_0, paddle.empty( - shape=shape, dtype=x.dtype).shape)), - paddle.empty( - shape=shape, dtype=x.dtype), ) + paddle.bernoulli(paddle.broadcast_to(bern_0, paddle.empty(shape=shape, dtype=x.dtype).shape)), + paddle.empty(shape=shape, dtype=x.dtype), + ) if keep_prob > 0.0 and scale_by_keep: random_tensor = random_tensor.divide(keep_prob) return x * random_tensor @@ -89,10 +82,7 @@ def extra_repr(self) -> str: class Mlp(paddle.nn.Layer): - def __init__(self, - config, - act_layer=paddle.nn.GELU, - norm_layer=paddle.nn.LayerNorm): + def __init__(self, config, act_layer=paddle.nn.GELU, norm_layer=paddle.nn.LayerNorm): super().__init__() in_features = config.embed_dim hidden_features = int(config.embed_dim * config.mlp_ratio) @@ -104,19 +94,20 @@ def __init__(self, hidden_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) self.fc2 = fleet.meta_parallel.ColumnParallelLinear( hidden_features, out_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) else: self.fc1 = paddle.nn.Linear(in_features, hidden_features) self.fc2 = paddle.nn.Linear(hidden_features, out_features) self.act = act_layer() - self.ffn_ln = (norm_layer(hidden_features) - if config.subln else paddle.nn.Identity()) + self.ffn_ln = norm_layer(hidden_features) if config.subln else paddle.nn.Identity() self.drop = paddle.nn.Dropout(p=config.drop_rate) def forward(self, x): @@ -131,11 +122,12 @@ def forward(self, x): class SwiGLU(paddle.nn.Layer): def __init__( - self, - config, - drop=0.0, - act_layer=paddle.nn.Silu, - norm_layer=paddle.nn.LayerNorm, ): + self, + config, + drop=0.0, + act_layer=paddle.nn.Silu, + norm_layer=paddle.nn.LayerNorm, + ): super().__init__() in_features = config.embed_dim hidden_features = int(config.embed_dim * config.mlp_ratio) @@ -146,26 +138,28 @@ def __init__( hidden_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) self.w2 = fleet.meta_parallel.ColumnParallelLinear( in_features, hidden_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) self.w3 = fleet.meta_parallel.ColumnParallelLinear( hidden_features, out_features, weight_attr=None, has_bias=True, - gather_output=True, ) + gather_output=True, + ) else: self.w1 = paddle.nn.Linear(in_features, hidden_features) self.w2 = paddle.nn.Linear(in_features, hidden_features) self.w3 = paddle.nn.Linear(hidden_features, out_features) self.act = act_layer() - self.ffn_ln = (norm_layer(hidden_features) - if config.subln else paddle.nn.Identity()) + self.ffn_ln = norm_layer(hidden_features) if config.subln else paddle.nn.Identity() self.drop = paddle.nn.Dropout(p=drop) def forward(self, x): @@ -180,11 +174,7 @@ def forward(self, x): class Attention(paddle.nn.Layer): - def __init__(self, - config, - window_size=None, - rope=None, - norm_layer=paddle.nn.LayerNorm): + def __init__(self, 
config, window_size=None, rope=None, norm_layer=paddle.nn.LayerNorm): super().__init__() dim = config.embed_dim self.xattn_drop = config.attn_drop_rate @@ -193,8 +183,7 @@ def __init__(self, self.num_heads = config.embed_dim // config.head_width head_dim = dim // self.num_heads - if hasattr(config, - "attn_head_dim") and config.attn_head_dim is not None: + if hasattr(config, "attn_head_dim") and config.attn_head_dim is not None: head_dim = config.attn_head_dim all_head_dim = head_dim * self.num_heads self.scale = config.qk_scale or head_dim**-0.5 @@ -205,23 +194,25 @@ def __init__(self, all_head_dim, weight_attr=None, has_bias=config.qkv_bias, - gather_output=True, ) + gather_output=True, + ) self.k_proj = fleet.meta_parallel.ColumnParallelLinear( dim, all_head_dim, weight_attr=None, has_bias=False, - gather_output=True, ) + gather_output=True, + ) self.v_proj = fleet.meta_parallel.ColumnParallelLinear( dim, all_head_dim, weight_attr=None, has_bias=config.qkv_bias, - gather_output=True, ) + gather_output=True, + ) else: self.q_proj = paddle.nn.Linear(dim, all_head_dim) - self.k_proj = paddle.nn.Linear( - dim, all_head_dim, bias_attr=False) + self.k_proj = paddle.nn.Linear(dim, all_head_dim, bias_attr=False) self.v_proj = paddle.nn.Linear(dim, all_head_dim) else: if dist.get_world_size() > 1: @@ -230,15 +221,14 @@ def __init__(self, all_head_dim * 3, weight_attr=None, has_bias=False, - gather_output=True, ) + gather_output=True, + ) else: - self.qkv = paddle.nn.Linear( - dim, all_head_dim * 3, bias_attr=False) + self.qkv = paddle.nn.Linear(dim, all_head_dim * 3, bias_attr=False) if config.qkv_bias: mpsize = 1 if dist.get_world_size() > 1: - mpsize = (fleet.get_hybrid_communicate_group() - .get_model_parallel_world_size()) + mpsize = fleet.get_hybrid_communicate_group().get_model_parallel_world_size() init_data = paddle.zeros(shape=[all_head_dim // mpsize]) self.q_bias = self.create_parameter( shape=[all_head_dim // mpsize], @@ -253,47 +243,42 @@ def __init__(self, self.v_bias = None if window_size: self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - init_data = paddle.zeros( - shape=[self.num_relative_distance, self.num_heads]) + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + init_data = paddle.zeros(shape=[self.num_relative_distance, self.num_heads]) self.relative_position_bias_table = self.create_parameter( shape=[self.num_relative_distance, self.num_heads], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) coords_h = paddle.arange(end=window_size[0]) coords_w = paddle.arange(end=window_size[1]) coords = paddle.stack(x=paddle.meshgrid([coords_h, coords_w])) coords_flatten = paddle.flatten(x=coords, start_axis=1) - relative_coords = ( - coords_flatten[:, :, (None)] - coords_flatten[:, (None), :]) + relative_coords = coords_flatten[:, :, (None)] - coords_flatten[:, (None), :] relative_coords = relative_coords.transpose(perm=[1, 2, 0]) relative_coords[:, :, (0)] += window_size[0] - 1 relative_coords[:, :, (1)] += window_size[1] - 1 relative_coords[:, :, (0)] *= 2 * window_size[1] - 1 relative_position_index = paddle.zeros( - shape=(window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype, ) + shape=(window_size[0] * window_size[1] + 1,) * 2, + dtype=relative_coords.dtype, + ) relative_position_index[1:, 1:] = relative_coords.sum(axis=-1) relative_position_index[(0), 0:] = 
self.num_relative_distance - 3 relative_position_index[0:, (0)] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", - relative_position_index) + self.register_buffer("relative_position_index", relative_position_index) else: self.window_size = None self.relative_position_bias_table = None self.relative_position_index = None self.attn_drop = paddle.nn.Dropout(p=self.xattn_drop) - self.inner_attn_ln = (norm_layer(all_head_dim) - if (config.subln and config.inner_attn_ln) else - paddle.nn.Identity()) + self.inner_attn_ln = ( + norm_layer(all_head_dim) if (config.subln and config.inner_attn_ln) else paddle.nn.Identity() + ) if dist.get_world_size() > 1: self.proj = fleet.meta_parallel.ColumnParallelLinear( - all_head_dim, - dim, - weight_attr=None, - has_bias=True, - gather_output=True) + all_head_dim, dim, weight_attr=None, has_bias=True, gather_output=True + ) else: self.proj = paddle.nn.Linear(all_head_dim, dim) self.proj_drop = paddle.nn.Dropout(p=config.drop_rate) @@ -306,46 +291,37 @@ def forward(self, x, rel_pos_bias=None, attn_mask=None): k = self.k_proj(x) v = self.v_proj(x) - q = q.reshape( - (B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) - k = k.reshape( - (B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) - v = v.reshape( - (B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) + q = q.reshape((B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) + k = k.reshape((B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) + v = v.reshape((B, N, self.num_heads, -1)).transpose(perm=[0, 2, 1, 3]) else: qkv_bias = None if self.q_bias is not None: out_0 = paddle.zeros_like(x=self.v_bias) out_0.stop_gradient = not False qkv_bias = paddle.concat(x=(self.q_bias, out_0, self.v_bias)) - qkv = paddle.nn.functional.linear( - x=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = paddle.nn.functional.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) if dist.get_world_size() > 1: hcg = fleet.get_hybrid_communicate_group() if hcg.get_model_parallel_world_size() > 1: model_parallel_group = hcg.get_model_parallel_group() - qkv = paddle.distributed.collective._c_concat( - qkv, group=model_parallel_group) + qkv = paddle.distributed.collective._c_concat(qkv, group=model_parallel_group) - qkv = qkv.reshape( - (B, N, 3, self.num_heads, -1)).transpose(perm=[2, 0, 3, 1, 4]) + qkv = qkv.reshape((B, N, 3, self.num_heads, -1)).transpose(perm=[2, 0, 3, 1, 4]) q, k, v = qkv[0], qkv[1], qkv[2] if self.rope: q_t = q[:, :, 1:, :] ro_q_t = self.rope(q_t) - q = paddle.concat( - x=(q[:, :, :1, :], ro_q_t), axis=-2).astype(dtype=v.dtype) + q = paddle.concat(x=(q[:, :, :1, :], ro_q_t), axis=-2).astype(dtype=v.dtype) k_t = k[:, :, 1:, :] ro_k_t = self.rope(k_t) - k = paddle.concat( - x=(k[:, :, :1, :], ro_k_t), axis=-2).astype(dtype=v.dtype) + k = paddle.concat(x=(k[:, :, :1, :], ro_k_t), axis=-2).astype(dtype=v.dtype) if self.xattn: q = q.transpose(perm=[0, 2, 1, 3]) k = k.transpose(perm=[0, 2, 1, 3]) v = v.transpose(perm=[0, 2, 1, 3]) - x = memory_efficient_attention( - q, k, v, p=self.xattn_drop, scale=self.scale) + x = memory_efficient_attention(q, k, v, p=self.xattn_drop, scale=self.scale) x = x.reshape((B, N, -1)) x = self.inner_attn_ln(x) x = self.proj(x) @@ -357,27 +333,28 @@ def forward(self, x, rel_pos_bias=None, attn_mask=None): perm_0 = list(range(x.ndim)) perm_0[-2] = x.ndim - 1 perm_0[-1] = x.ndim - 2 - attn = q @x.transpose(perm=perm_0) + attn = q @ x.transpose(perm=perm_0) if 
self.relative_position_bias_table is not None: relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape((-1))].reshape(( + self.relative_position_index.reshape((-1)) + ].reshape( + ( self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, - -1, )) - relative_position_bias = relative_position_bias.transpose( - perm=[2, 0, 1]) - attn = attn + relative_position_bias.unsqueeze(axis=0).astype( - dtype=attn.dtype) + -1, + ) + ) + relative_position_bias = relative_position_bias.transpose(perm=[2, 0, 1]) + attn = attn + relative_position_bias.unsqueeze(axis=0).astype(dtype=attn.dtype) if rel_pos_bias is not None: attn = attn + rel_pos_bias.astype(dtype=attn.dtype) if attn_mask is not None: attn_mask = attn_mask.astype(dtype="bool") - attn = paddle.where(~attn_mask[:, (None), (None), :], attn, - float("-inf")) + attn = paddle.where(~attn_mask[:, (None), (None), :], attn, float("-inf")) attn = paddle.nn.functional.softmax(attn, axis=-1) with get_rng_state_tracker().rng_state("global_seed"): attn = self.attn_drop(attn) - x = attn @v + x = attn @ v perm_1 = list(range(x.ndim)) perm_1[1] = 2 perm_1[2] = 1 @@ -391,13 +368,14 @@ def forward(self, x, rel_pos_bias=None, attn_mask=None): class Block(paddle.nn.Layer): def __init__( - self, - config, - drop_path=0.0, - window_size=None, - rope=None, - act_layer=paddle.nn.GELU, - norm_layer=paddle.nn.LayerNorm, ): + self, + config, + drop_path=0.0, + window_size=None, + rope=None, + act_layer=paddle.nn.GELU, + norm_layer=paddle.nn.LayerNorm, + ): super().__init__() dim = config.embed_dim init_values = config.init_values @@ -405,8 +383,7 @@ def __init__( self.norm1 = norm_layer(dim) self.attn = Attention(config, window_size=window_size, rope=rope) - self.drop_path = (DropPath(drop_path) - if drop_path > 0.0 else paddle.nn.Identity()) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else paddle.nn.Identity() self.norm2 = norm_layer(dim) if config.naiveswiglu: self.mlp = SwiGLU(config, norm_layer=norm_layer) @@ -415,38 +392,32 @@ def __init__( if init_values is not None and init_values > 0: init_data = init_values * paddle.ones(shape=dim) self.gamma_1 = self.create_parameter( - shape=dim, - default_initializer=paddle.nn.initializer.Assign(init_data)) + shape=dim, default_initializer=paddle.nn.initializer.Assign(init_data) + ) init_data = init_values * paddle.ones(shape=dim) self.gamma_2 = self.create_parameter( - shape=dim, - default_initializer=paddle.nn.initializer.Assign(init_data)) + shape=dim, default_initializer=paddle.nn.initializer.Assign(init_data) + ) else: self.gamma_1, self.gamma_2 = None, None def forward(self, x, rel_pos_bias=None, attn_mask=None): if self.gamma_1 is None: if self.postnorm: - x = x + self.drop_path( - self.norm1( - self.attn( - x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) + x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) x = x + self.drop_path(self.norm2(self.mlp(x))) else: - x = x + self.drop_path( - self.attn( - self.norm1(x), - rel_pos_bias=rel_pos_bias, - attn_mask=attn_mask)) + x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) x = x + self.drop_path(self.mlp(self.norm2(x))) elif self.postnorm: - x = x + self.drop_path(self.gamma_1 * self.norm1( - self.attn( - x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))) + x = x + self.drop_path( + self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) + ) x = x + 
self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) else: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)) + x = x + self.drop_path( + self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask) + ) x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) return x @@ -458,10 +429,8 @@ def __init__(self, config): super().__init__() img_size = to_2tuple(config.img_size) patch_size = to_2tuple(config.patch_size) - num_patches = img_size[1] // patch_size[1] * (img_size[0] // - patch_size[0]) - self.patch_shape = img_size[0] // patch_size[0], img_size[ - 1] // patch_size[1] + num_patches = img_size[1] // patch_size[1] * (img_size[0] // patch_size[0]) + self.patch_shape = img_size[0] // patch_size[0], img_size[1] // patch_size[1] self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches @@ -469,7 +438,8 @@ def __init__(self, config): in_channels=config.in_chans, out_channels=config.embed_dim, kernel_size=patch_size, - stride=patch_size, ) + stride=patch_size, + ) def forward(self, x, **kwargs): B, C, H, W = x.shape @@ -488,25 +458,25 @@ class RelativePositionBias(paddle.nn.Layer): def __init__(self, window_size, num_heads): super().__init__() self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 init_data = paddle.zeros(shape=[self.num_relative_distance, num_heads]) self.relative_position_bias_table = self.create_parameter( shape=[self.num_relative_distance, num_heads], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) coords_h = paddle.arange(end=window_size[0]) coords_w = paddle.arange(end=window_size[1]) coords = paddle.stack(x=paddle.meshgrid([coords_h, coords_w])) coords_flatten = paddle.flatten(x=coords, start_axis=1) - relative_coords = coords_flatten[:, :, (None)] - coords_flatten[:, ( - None), :] + relative_coords = coords_flatten[:, :, (None)] - coords_flatten[:, (None), :] relative_coords = relative_coords.transpose(perm=[1, 2, 0]) relative_coords[:, :, (0)] += window_size[0] - 1 relative_coords[:, :, (1)] += window_size[1] - 1 relative_coords[:, :, (0)] *= 2 * window_size[1] - 1 relative_position_index = paddle.zeros( - shape=(window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype, ) + shape=(window_size[0] * window_size[1] + 1,) * 2, + dtype=relative_coords.dtype, + ) relative_position_index[1:, 1:] = relative_coords.sum(axis=-1) relative_position_index[(0), 0:] = self.num_relative_distance - 3 relative_position_index[0:, (0)] = self.num_relative_distance - 2 @@ -514,11 +484,13 @@ def __init__(self, window_size, num_heads): self.register_buffer("relative_position_index", relative_position_index) def forward(self): - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape((-1))].reshape(( + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.reshape((-1))].reshape( + ( self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, - -1, )) + -1, + ) + ) return relative_position_bias.transpose(perm=[2, 0, 1]) @@ -528,42 +500,43 @@ class EVAVisionTransformerConfig(PretrainedConfig): attribute_map: Dict[str, str] = {} def __init__( - self, - img_size=224, - patch_size=16, - in_chans=3, - 
num_classes=1000, - embed_dim=768, - depth=12, - num_heads=8, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.0, - init_values=None, - patch_dropout=0.0, - use_abs_pos_emb=True, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - rope=False, - use_mean_pooling=True, - attentional_pool=False, - n_queries: int=256, - attn_pooler_heads: int=8, - init_scale=0.001, - enable_recompute=False, - xattn=False, - postnorm=False, - pt_hw_seq_len=16, - intp_freq=False, - naiveswiglu=False, - subln=False, - output_tokens=False, - fusedLN=False, - inner_attn_ln=True, - **kwargs, ): + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=768, + depth=12, + num_heads=8, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + init_values=None, + patch_dropout=0.0, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + rope=False, + use_mean_pooling=True, + attentional_pool=False, + n_queries: int = 256, + attn_pooler_heads: int = 8, + init_scale=0.001, + enable_recompute=False, + xattn=False, + postnorm=False, + pt_hw_seq_len=16, + intp_freq=False, + naiveswiglu=False, + subln=False, + output_tokens=False, + fusedLN=False, + inner_attn_ln=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) self.img_size = img_size @@ -602,14 +575,10 @@ def __init__( self.inner_attn_ln = inner_attn_ln @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) - - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
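The EVAVisionTransformerConfig defaults above (img_size=224, patch_size=16, embed_dim=768, ...) feed directly into the PatchEmbed bookkeeping reformatted a few hunks earlier: the image is split into a grid of patches, and the flattened grid length plus one cls token gives the sequence length that the position embedding further down is sized for. A small self-contained sketch of that arithmetic follows, written only for illustration (the helper name is not part of the codebase).

def patch_grid(img_size=(224, 224), patch_size=(16, 16)):
    # Same integer divisions as PatchEmbed.__init__ in the diff above:
    # patch_shape is the (rows, cols) grid of patches, num_patches its product.
    patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
    num_patches = patch_shape[0] * patch_shape[1]
    return patch_shape, num_patches


shape, n = patch_grid()   # ((14, 14), 196) with the 224/16 defaults
seq_len = n + 1           # +1 for the cls token; matches the pos_embed
                          # shape [1, num_patches + 1, embed_dim]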
@@ -644,9 +613,7 @@ def __init__(self, config: EVAVisionTransformerConfig): self.embed_dim = embed_dim = config.embed_dim self.naiveswiglu = config.naiveswiglu use_mean_pooling = config.use_mean_pooling - norm_layer = (partial( - FusedLayerNorm, epsilon=1e-6) if config.fusedLN else partial( - LayerNorm, epsilon=1e-6)) + norm_layer = partial(FusedLayerNorm, epsilon=1e-6) if config.fusedLN else partial(LayerNorm, epsilon=1e-6) num_heads = config.embed_dim // config.head_width self.patch_embed = PatchEmbed(config) @@ -654,18 +621,19 @@ def __init__(self, config: EVAVisionTransformerConfig): init_data = paddle.zeros(shape=[1, 1, embed_dim]) self.cls_token = self.create_parameter( shape=[1, 1, embed_dim], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) if config.use_abs_pos_emb: init_data = paddle.zeros(shape=[1, num_patches + 1, embed_dim]) self.pos_embed = self.create_parameter( shape=[1, num_patches + 1, embed_dim], - default_initializer=paddle.nn.initializer.Assign(init_data), ) + default_initializer=paddle.nn.initializer.Assign(init_data), + ) else: self.pos_embed = None self.pos_drop = paddle.nn.Dropout(p=config.drop_rate) if config.use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias( - window_size=self.patch_embed.patch_shape, num_heads=num_heads) + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads) else: self.rel_pos_bias = None if config.rope: @@ -674,55 +642,59 @@ def __init__(self, config: EVAVisionTransformerConfig): self.rope = VisionRotaryEmbeddingFast( dim=half_head_dim, pt_seq_len=config.pt_hw_seq_len, - ft_seq_len=hw_seq_len if config.intp_freq else None, ) + ft_seq_len=hw_seq_len if config.intp_freq else None, + ) else: self.rope = None - dpr = [ - x.item() - for x in paddle.linspace( - start=0, stop=config.drop_path_rate, num=config.depth) - ] - self.blocks = paddle.nn.LayerList(sublayers=[ - Block( - config, - drop_path=dpr[i], - norm_layer=norm_layer, - window_size=self.patch_embed.patch_shape - if config.use_rel_pos_bias else None, - rope=self.rope, ) for i in range(config.depth) - ]) + dpr = [x.item() for x in paddle.linspace(start=0, stop=config.drop_path_rate, num=config.depth)] + self.blocks = paddle.nn.LayerList( + sublayers=[ + Block( + config, + drop_path=dpr[i], + norm_layer=norm_layer, + window_size=self.patch_embed.patch_shape if config.use_rel_pos_bias else None, + rope=self.rope, + ) + for i in range(config.depth) + ] + ) if config.attentional_pool: self.attn_pool = AttentionalPooler(config) - self.norm = (paddle.nn.Identity() - if use_mean_pooling else norm_layer(num_classes)) + self.norm = paddle.nn.Identity() if use_mean_pooling else norm_layer(num_classes) self.fc_norm = norm_layer(num_classes) if use_mean_pooling else None if dist.get_world_size() > 1: - self.head = (fleet.meta_parallel.ColumnParallelLinear( - num_classes, - num_classes, - weight_attr=None, - has_bias=True, - gather_output=True, ) - if num_classes > 0 else paddle.nn.Identity()) + self.head = ( + fleet.meta_parallel.ColumnParallelLinear( + num_classes, + num_classes, + weight_attr=None, + has_bias=True, + gather_output=True, + ) + if num_classes > 0 + else paddle.nn.Identity() + ) else: - self.head = (paddle.nn.Linear(num_classes, num_classes) - if num_classes > 0 else paddle.nn.Identity()) + self.head = paddle.nn.Linear(num_classes, num_classes) if num_classes > 0 else paddle.nn.Identity() else: self.attn_pool = None - self.norm = 
(paddle.nn.Identity() - if use_mean_pooling else norm_layer(embed_dim)) + self.norm = paddle.nn.Identity() if use_mean_pooling else norm_layer(embed_dim) self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None if dist.get_world_size() > 1: - self.head = (fleet.meta_parallel.ColumnParallelLinear( - embed_dim, - num_classes, - weight_attr=None, - has_bias=True, - gather_output=True, ) - if num_classes > 0 else paddle.nn.Identity()) + self.head = ( + fleet.meta_parallel.ColumnParallelLinear( + embed_dim, + num_classes, + weight_attr=None, + has_bias=True, + gather_output=True, + ) + if num_classes > 0 + else paddle.nn.Identity() + ) else: - self.head = (paddle.nn.Linear(embed_dim, num_classes) - if num_classes > 0 else paddle.nn.Identity()) + self.head = paddle.nn.Linear(embed_dim, num_classes) if num_classes > 0 else paddle.nn.Identity() if self.pos_embed is not None: trunc_normal_(self.pos_embed, std=0.02) trunc_normal_(self.cls_token, std=0.02) @@ -731,13 +703,9 @@ def __init__(self, config: EVAVisionTransformerConfig): if isinstance(self.head, fleet.meta_parallel.ColumnParallelLinear): trunc_normal_(self.head.weight, std=0.02) with paddle.no_grad(): - self.head.weight.set_value( - self.head.weight.scale(scale=config.init_scale)) - self.head.bias.set_value( - self.head.bias.scale(scale=config.init_scale)) - self.patch_dropout = (PatchDropout(config.patch_dropout) - if config.patch_dropout > 0.0 else - paddle.nn.Identity()) + self.head.weight.set_value(self.head.weight.scale(scale=config.init_scale)) + self.head.bias.set_value(self.head.bias.scale(scale=config.init_scale)) + self.patch_dropout = PatchDropout(config.patch_dropout) if config.patch_dropout > 0.0 else paddle.nn.Identity() def fix_init_weight(self): def rescale(param, layer_id): @@ -762,8 +730,7 @@ def get_cast_dtype(self) -> paddle.dtype: def _init_weights(self, m): zeros_params = paddle.nn.initializer.Constant(0.0) ones_params = paddle.nn.initializer.Constant(1.0) - if isinstance(m, (paddle.nn.Linear, - fleet.meta_parallel.ColumnParallelLinear)): + if isinstance(m, (paddle.nn.Linear, fleet.meta_parallel.ColumnParallelLinear)): trunc_normal_(m.weight, std=0.02) if m.bias is not None: zeros_params(m.bias) @@ -775,8 +742,7 @@ def get_num_layers(self): return len(self.blocks) def lock(self, unlocked_groups=0, freeze_bn_stats=False): - assert (unlocked_groups == 0 - ), "partial locking not currently supported for this model" + assert unlocked_groups == 0, "partial locking not currently supported for this model" for param in self.parameters(): param.stop_gradient = not False @@ -792,16 +758,19 @@ def get_classifier(self): def reset_classifier(self, num_classes, global_pool=""): self.num_classes = num_classes if dist.get_world_size() > 1: - self.head = (fleet.meta_parallel.ColumnParallelLinear( - self.embed_dim, - num_classes, - weight_attr=None, - has_bias=True, - gather_output=True, ) - if num_classes > 0 else paddle.nn.Identity()) + self.head = ( + fleet.meta_parallel.ColumnParallelLinear( + self.embed_dim, + num_classes, + weight_attr=None, + has_bias=True, + gather_output=True, + ) + if num_classes > 0 + else paddle.nn.Identity() + ) else: - self.head = (paddle.nn.Linear(self.embed_dim, num_classes) - if num_classes > 0 else paddle.nn.Identity()) + self.head = paddle.nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else paddle.nn.Identity() def forward_features(self, x, return_all_features=False): x = self.patch_embed(x) @@ -813,25 +782,20 @@ def forward_features(self, x, return_all_features=False): with 
get_rng_state_tracker().rng_state("global_seed"): x = self.pos_drop(x) if os.getenv("RoPE") == "1": - if self.training and not isinstance(self.patch_dropout, - paddle.nn.Identity): + if self.training and not isinstance(self.patch_dropout, paddle.nn.Identity): x, patch_indices_keep = self.patch_dropout(x) - self.rope.forward = partial( - self.rope.forward, patch_indices_keep=patch_indices_keep) + self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep) else: - self.rope.forward = partial( - self.rope.forward, patch_indices_keep=None) + self.rope.forward = partial(self.rope.forward, patch_indices_keep=None) x = self.patch_dropout(x) else: x = self.patch_dropout(x) - rel_pos_bias = self.rel_pos_bias( - ) if self.rel_pos_bias is not None else None + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None cnt = 0 for blk in self.blocks: cnt += 1 if self.enable_recompute: - x = paddle.distributed.fleet.utils.recompute( - blk, x, rel_pos_bias, use_reentrant=False) + x = paddle.distributed.fleet.utils.recompute(blk, x, rel_pos_bias, use_reentrant=False) else: x = blk(x, rel_pos_bias=rel_pos_bias) diff --git a/paddlemix/models/evaclip/loss.py b/paddlemix/models/evaclip/loss.py index 89901ae072014..0181da0b724cb 100644 --- a/paddlemix/models/evaclip/loss.py +++ b/paddlemix/models/evaclip/loss.py @@ -20,10 +20,7 @@ from paddlemix.models.common.distributed_utils import allgather -def gather_features_cat_group_bk(image_features, - text_features, - group, - gather_with_grad=False): +def gather_features_cat_group_bk(image_features, text_features, group, gather_with_grad=False): if group.world_size <= 1: return image_features, text_features features = paddle.concat([image_features, text_features], axis=-1) @@ -37,10 +34,7 @@ def gather_features_cat_group_bk(image_features, return image_features, text_features -def gather_features_cat_group(image_features, - text_features, - group, - gather_with_grad=False): +def gather_features_cat_group(image_features, text_features, group, gather_with_grad=False): if group.world_size <= 1: return image_features, text_features if gather_with_grad: @@ -57,34 +51,35 @@ def gather_features_cat_group(image_features, def gather_features( - image_features, - text_features, - local_loss=False, - gather_with_grad=False, - rank=0, - world_size=1, - use_horovod=False, ): + image_features, + text_features, + local_loss=False, + gather_with_grad=False, + rank=0, + world_size=1, + use_horovod=False, +): hcg = paddle.distributed.fleet.get_hybrid_communicate_group() shardinggroup = hcg.get_sharding_parallel_group() dpgroup = hcg.get_data_parallel_group() if gather_with_grad: if shardinggroup.nranks > 1: image_features, text_features = gather_features_cat_group( - image_features, text_features, shardinggroup, gather_with_grad) + image_features, text_features, shardinggroup, gather_with_grad + ) if dpgroup.nranks > 1: image_features, text_features = gather_features_cat_group( - image_features, text_features, dpgroup, gather_with_grad) + image_features, text_features, dpgroup, gather_with_grad + ) all_image_features = image_features all_text_features = text_features else: image_features_bk = image_features text_features_bk = text_features if shardinggroup.nranks > 1: - image_features, text_features = gather_features_cat_group( - image_features, text_features, shardinggroup) + image_features, text_features = gather_features_cat_group(image_features, text_features, shardinggroup) if dpgroup.nranks > 1: - image_features, text_features = 
gather_features_cat_group( - image_features, text_features, dpgroup) + image_features, text_features = gather_features_cat_group(image_features, text_features, dpgroup) if not local_loss: dp_rank = hcg.get_data_parallel_rank() sharding_rank = hcg.get_sharding_parallel_rank() @@ -104,13 +99,14 @@ def gather_features( def gather_features_bk( - image_features, - text_features, - local_loss=False, - gather_with_grad=False, - rank=0, - world_size=1, - use_horovod=False, ): + image_features, + text_features, + local_loss=False, + gather_with_grad=False, + rank=0, + world_size=1, + use_horovod=False, +): # We gather tensors from all gpus if gather_with_grad: @@ -137,14 +133,15 @@ def gather_features_bk( class ClipLoss(nn.Layer): def __init__( - self, - local_loss=False, - gather_with_grad=False, - cache_labels=False, - visual_loss=True, - text_loss=False, - rank=0, - world_size=1, ): + self, + local_loss=False, + gather_with_grad=False, + cache_labels=False, + visual_loss=True, + text_loss=False, + rank=0, + world_size=1, + ): super().__init__() self.local_loss = local_loss self.gather_with_grad = gather_with_grad @@ -163,18 +160,18 @@ def forward(self, preds): self.local_loss, self.gather_with_grad, self.rank, - self.world_size, ) + self.world_size, + ) if self.local_loss: - logits_per_image = logit_scale * image_features @all_text_features.T - logits_per_text = logit_scale * text_features @all_image_features.T + logits_per_image = logit_scale * image_features @ all_text_features.T + logits_per_text = logit_scale * text_features @ all_image_features.T else: - logits_per_image = (logit_scale * all_image_features - @all_text_features.T) + logits_per_image = logit_scale * all_image_features @ all_text_features.T logits_per_text = logits_per_image.T else: - logits_per_image = logit_scale * image_features @text_features.T - logits_per_text = logit_scale * text_features @image_features.T + logits_per_image = logit_scale * image_features @ text_features.T + logits_per_text = logit_scale * text_features @ image_features.T # calculated ground-truth and cache if enabled num_logits = logits_per_image.shape[0] diff --git a/paddlemix/models/evaclip/modules/fusedln.py b/paddlemix/models/evaclip/modules/fusedln.py index 3e2df13a6148e..beb68a540da7b 100644 --- a/paddlemix/models/evaclip/modules/fusedln.py +++ b/paddlemix/models/evaclip/modules/fusedln.py @@ -55,17 +55,19 @@ def check_normalized_shape(normalized_shape): class FusedLayerNorm(OriginLayerNorm): def __init__( - self, - normalized_shape, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - name=None, ): + self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None, + ): super().__init__( normalized_shape=normalized_shape, epsilon=epsilon, weight_attr=weight_attr, - bias_attr=bias_attr, ) + bias_attr=bias_attr, + ) check_normalized_shape(self._normalized_shape) def forward(self, input): @@ -74,17 +76,19 @@ def forward(self, input): class FastLayerNorm(OriginLayerNorm): def __init__( - self, - normalized_shape, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - name=None, ): + self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None, + ): super().__init__( normalized_shape=normalized_shape, epsilon=epsilon, weight_attr=weight_attr, - bias_attr=bias_attr, ) + bias_attr=bias_attr, + ) check_normalized_shape(self._normalized_shape) def forward(self, input): @@ -105,21 +109,19 @@ def backward(ctx, y_grad): if bias is None: if hasattr(weight, "main_grad"): - weight.main_grad, _ = 
_C_ops.fused_linear_param_grad_add( - x, y_grad, weight.main_grad, None, True) + weight.main_grad, _ = _C_ops.fused_linear_param_grad_add(x, y_grad, weight.main_grad, None, True) return x_grad, None else: - weight_grad, _ = _C_ops.fused_linear_param_grad_add( - x, y_grad, None, None, False) + weight_grad, _ = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False) return x_grad, weight_grad if hasattr(weight, "main_grad") and hasattr(bias, "main_grad"): weight.main_grad, bias.main_grad = _C_ops.fused_linear_param_grad_add( - x, y_grad, weight.main_grad, bias.main_grad, True) + x, y_grad, weight.main_grad, bias.main_grad, True + ) return x_grad, None, None else: - weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add( - x, y_grad, None, None, False) + weight_grad, bias_grad = _C_ops.fused_linear_param_grad_add(x, y_grad, None, None, False) return x_grad, weight_grad, bias_grad diff --git a/paddlemix/models/evaclip/modules/rope.py b/paddlemix/models/evaclip/modules/rope.py index 130f517ad4299..adaaf2a9872f7 100644 --- a/paddlemix/models/evaclip/modules/rope.py +++ b/paddlemix/models/evaclip/modules/rope.py @@ -21,21 +21,19 @@ def broadcat(tensors, dim=-1): num_tensors = len(tensors) shape_lens = set(list(map(lambda t: len(t.shape), tensors))) - assert len( - shape_lens) == 1, "tensors must all have the same number of dimensions" + assert len(shape_lens) == 1, "tensors must all have the same number of dimensions" shape_len = list(shape_lens)[0] dim = dim + shape_len if dim < 0 else dim dims = list(zip(*map(lambda t: list(t.shape), tensors))) expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim] - assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims) - ]), "invalid dimensions for broadcastable concatentation" + assert all( + [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)] + ), "invalid dimensions for broadcastable concatentation" max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims)) - expanded_dims = list( - map(lambda t: (t[0], (t[1], ) * num_tensors), max_dims)) + expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims)) expanded_dims.insert(dim, (dim, dims[dim])) expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims))) - tensors = list( - map(lambda t: t[0].expand(shape=t[1]), zip(tensors, expandable_shapes))) + tensors = list(map(lambda t: t[0].expand(shape=t[1]), zip(tensors, expandable_shapes))) return paddle.concat(x=tensors, axis=dim) @@ -48,25 +46,23 @@ def rotate_half(x): class VisionRotaryEmbedding(paddle.nn.Layer): def __init__( - self, - dim, - pt_seq_len, - ft_seq_len=None, - custom_freqs=None, - freqs_for="lang", - theta=10000, - max_freq=10, - num_freqs=1, ): + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs=None, + freqs_for="lang", + theta=10000, + max_freq=10, + num_freqs=1, + ): super().__init__() if custom_freqs: freqs = custom_freqs elif freqs_for == "lang": - freqs = 1.0 / theta**(paddle.arange( - start=0, end=dim, - step=2)[:dim // 2].astype(dtype="float32") / dim) + freqs = 1.0 / theta ** (paddle.arange(start=0, end=dim, step=2)[: dim // 2].astype(dtype="float32") / dim) elif freqs_for == "pixel": - freqs = paddle.linspace( - start=1.0, stop=max_freq / 2, num=dim // 2) * pi + freqs = paddle.linspace(start=1.0, stop=max_freq / 2, num=dim // 2) * pi elif freqs_for == "constant": freqs = paddle.ones(shape=num_freqs).astype(dtype="float32") else: @@ -92,33 +88,32 @@ def forward(self, t, start_index=0): t_left, t, t_right = ( t[(...), :start_index], t[(...), 
start_index:end_index], - t[(...), end_index:], ) + t[(...), end_index:], + ) t = t * self.freqs_cos + rotate_half(t) * self.freqs_sin return paddle.concat(x=(t_left, t, t_right), axis=-1) class VisionRotaryEmbeddingFast(paddle.nn.Layer): def __init__( - self, - dim, - pt_seq_len, - ft_seq_len=None, - custom_freqs=None, - freqs_for="lang", - theta=10000, - max_freq=10, - num_freqs=1, - patch_dropout=0.0, ): + self, + dim, + pt_seq_len, + ft_seq_len=None, + custom_freqs=None, + freqs_for="lang", + theta=10000, + max_freq=10, + num_freqs=1, + patch_dropout=0.0, + ): super().__init__() if custom_freqs: freqs = custom_freqs elif freqs_for == "lang": - freqs = 1.0 / theta**(paddle.arange( - start=0, end=dim, - step=2)[:dim // 2].astype(dtype="float32") / dim) + freqs = 1.0 / theta ** (paddle.arange(start=0, end=dim, step=2)[: dim // 2].astype(dtype="float32") / dim) elif freqs_for == "pixel": - freqs = paddle.linspace( - start=1.0, stop=max_freq / 2, num=dim // 2) * pi + freqs = paddle.linspace(start=1.0, stop=max_freq / 2, num=dim // 2) * pi elif freqs_for == "constant": freqs = paddle.ones(shape=num_freqs).astype(dtype="float32") else: diff --git a/paddlemix/models/evaclip/utils.py b/paddlemix/models/evaclip/utils.py index 43da3ab27ee85..afc62e5d79a97 100644 --- a/paddlemix/models/evaclip/utils.py +++ b/paddlemix/models/evaclip/utils.py @@ -59,10 +59,7 @@ def parse(x): to_ntuple = _ntuple -def clip_grad_norm_(parameters, - max_norm, - norm_type, - error_if_nonfinite: bool=False): +def clip_grad_norm_(parameters, max_norm, norm_type, error_if_nonfinite: bool = False): r"""Clips gradient norm of an iterable of parameters. The norm is computed over all gradients together, as if they were @@ -90,19 +87,16 @@ def clip_grad_norm_(parameters, return paddle.to_tensor([0.0]) if norm_type == float("inf"): norms = [g.detach().abs().max() for g in grads] - total_norm = norms[0] if len(norms) == 1 else paddle.max( - paddle.stack(norms)) + total_norm = norms[0] if len(norms) == 1 else paddle.max(paddle.stack(norms)) else: - total_norm = paddle.norm( - paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), - norm_type) - if error_if_nonfinite and paddle.logical_or(total_norm.isnan(), - total_norm.isinf()): + total_norm = paddle.norm(paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), norm_type) + if error_if_nonfinite and paddle.logical_or(total_norm.isnan(), total_norm.isinf()): raise RuntimeError( f"The total norm of order {norm_type} for gradients from " "`parameters` is non-finite, so it cannot be clipped. 
To disable " "this error and scale the gradients by the non-finite norm anyway, " - "set `error_if_nonfinite=False`") + "set `error_if_nonfinite=False`" + ) clip_coef = max_norm / (total_norm + 1e-6) # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization @@ -111,15 +105,10 @@ def clip_grad_norm_(parameters, for g in grads: clipg = paddle.multiply(g, clip_coef_clamped) g.set_value(clipg) - total_norm_clip = paddle.norm( - paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), - norm_type) + total_norm_clip = paddle.norm(paddle.stack([paddle.norm(g.detach(), norm_type) for g in grads]), norm_type) return total_norm_clip -def clip_grad_norm(model, - max_norm, - norm_type=2.0, - error_if_nonfinite: bool=False): +def clip_grad_norm(model, max_norm, norm_type=2.0, error_if_nonfinite: bool = False): parameters = model.parameters() return clip_grad_norm_(parameters, max_norm, norm_type, error_if_nonfinite) diff --git a/paddlemix/models/groundingdino/backbone/backbone.py b/paddlemix/models/groundingdino/backbone/backbone.py index c2ac019c50caf..a6bc6c4b63e16 100644 --- a/paddlemix/models/groundingdino/backbone/backbone.py +++ b/paddlemix/models/groundingdino/backbone/backbone.py @@ -15,12 +15,10 @@ Backbone modules. """ -from collections import OrderedDict -from typing import Dict, List, Optional +from typing import List import paddle import paddle.nn as nn -import paddle.nn.functional as F from .position_encoding import build_position_encoding from .swin_transformer import SwinTransformerModel @@ -59,11 +57,11 @@ def build_backbone(args): use_checkpoint = getattr(args, "use_checkpoint", False) if args.backbone in [ - "swin_T_224_1k", - "swin_B_224_22k", - "swin_B_384_22k", - "swin_L_224_22k", - "swin_L_384_22k", + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", ]: pretrain_img_size = int(args.backbone.split("_")[-2]) backbone = SwinTransformerModel.from_pretrained( @@ -71,9 +69,10 @@ def build_backbone(args): pretrain_img_size=pretrain_img_size, out_indices=tuple(return_interm_indices), dilation=False, - use_checkpoint=use_checkpoint, ) + use_checkpoint=use_checkpoint, + ) - bb_num_channels = backbone.num_features[4 - len(return_interm_indices):] + bb_num_channels = backbone.num_features[4 - len(return_interm_indices) :] else: raise NotImplementedError("Unknown backbone {}".format(args.backbone)) @@ -83,9 +82,8 @@ def build_backbone(args): model = Joiner(backbone, position_embedding) model.num_channels = bb_num_channels - assert isinstance( - bb_num_channels, - List), "bb_num_channels is expected to be a List but {}".format( - type(bb_num_channels)) + assert isinstance(bb_num_channels, List), "bb_num_channels is expected to be a List but {}".format( + type(bb_num_channels) + ) return model diff --git a/paddlemix/models/groundingdino/backbone/position_encoding.py b/paddlemix/models/groundingdino/backbone/position_encoding.py index 4c4410e6023db..4f7d1af31f124 100644 --- a/paddlemix/models/groundingdino/backbone/position_encoding.py +++ b/paddlemix/models/groundingdino/backbone/position_encoding.py @@ -18,8 +18,6 @@ import paddle import paddle.nn as nn -from matplotlib.pyplot import axis -from paddlenlp.utils.initializer import uniform_ class PositionEmbeddingSineHW(nn.Layer): @@ -29,12 +27,13 @@ class PositionEmbeddingSineHW(nn.Layer): """ def __init__( - self, - num_pos_feats=64, - temperatureH=10000, - 
temperatureW=10000, - normalize=False, - scale=None, ): + self, + num_pos_feats=64, + temperatureH=10000, + temperatureW=10000, + normalize=False, + scale=None, + ): super().__init__() self.num_pos_feats = num_pos_feats self.temperatureH = temperatureH @@ -61,23 +60,15 @@ def forward(self, mask: paddle.Tensor): x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale dim_tx = paddle.arange(self.num_pos_feats) - dim_tx = self.temperatureW**( - 2 * (paddle.floor_divide(dim_tx, paddle.to_tensor(2))) / - self.num_pos_feats) + dim_tx = self.temperatureW ** (2 * (paddle.floor_divide(dim_tx, paddle.to_tensor(2))) / self.num_pos_feats) pos_x = x_embed[:, :, :, None] / dim_tx dim_ty = paddle.arange(self.num_pos_feats) - dim_ty = self.temperatureH**( - 2 * (paddle.floor_divide(dim_ty, paddle.to_tensor(2))) / - self.num_pos_feats) + dim_ty = self.temperatureH ** (2 * (paddle.floor_divide(dim_ty, paddle.to_tensor(2))) / self.num_pos_feats) pos_y = y_embed[:, :, :, None] / dim_ty - pos_x = paddle.stack( - (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), - axis=4).flatten(3) - pos_y = paddle.stack( - (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), - axis=4).flatten(3) + pos_x = paddle.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), axis=4).flatten(3) + pos_y = paddle.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), axis=4).flatten(3) pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) return pos @@ -91,7 +82,8 @@ def build_position_encoding(args): N_steps, temperatureH=args.pe_temperatureH, temperatureW=args.pe_temperatureW, - normalize=True, ) + normalize=True, + ) elif args.position_embedding in ("v3", "learned"): position_embedding = PositionEmbeddingLearned(N_steps) else: diff --git a/paddlemix/models/groundingdino/backbone/swin_transformer.py b/paddlemix/models/groundingdino/backbone/swin_transformer.py index 2102a8bc1fa5f..dda95ec7b9932 100644 --- a/paddlemix/models/groundingdino/backbone/swin_transformer.py +++ b/paddlemix/models/groundingdino/backbone/swin_transformer.py @@ -15,20 +15,21 @@ import os from typing import Union -import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.distributed.fleet.utils import recompute from paddle.nn.initializer import Constant +from paddlemix.utils.log import logger + from ..layers import DropPath, to_2tuple trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02) from paddlenlp.transformers.configuration_utils import PretrainedConfig -from paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + """ swin_transformer model configuration""" __all__ = ["SwinTransformerConfig"] @@ -38,28 +39,29 @@ class SwinTransformerConfig(PretrainedConfig): model_type = "swintransformer" def __init__( - self, - in_chans=3, - embed_dim=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - pretrain_img_size=224, - patch_size=4, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop_rate=0.0, - attn_drop_rate=0.0, - drop_path_rate=0.2, - norm_layer=nn.LayerNorm, - ape=False, - patch_norm=True, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - dilation=False, - use_checkpoint=False, - **kwargs, ): + self, + in_chans=3, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + pretrain_img_size=224, + patch_size=4, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, 
+ drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + dilation=False, + use_checkpoint=False, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -85,14 +87,10 @@ def __init__( self.use_checkpoint = use_checkpoint @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) - - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -116,12 +114,13 @@ class Mlp(nn.Layer): """Multilayer perceptron.""" def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, ): + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -148,10 +147,8 @@ def window_partition(x, window_size): windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape - x = x.reshape( - [B, H // window_size, window_size, W // window_size, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) + x = x.reshape([B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) return windows @@ -166,8 +163,7 @@ def window_reverse(windows, window_size, H, W): x: (B, H, W, C) """ B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.reshape( - [B, H // window_size, W // window_size, window_size, window_size, -1]) + x = windows.reshape([B, H // window_size, W // window_size, window_size, window_size, -1]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1]) return x @@ -186,14 +182,15 @@ class WindowAttention(nn.Layer): """ def __init__( - self, - dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, ): + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): super().__init__() self.dim = dim @@ -204,24 +201,19 @@ def __init__( # define a parameter table of relative position bias self.relative_position_bias_table = self.create_parameter( - shape=[(2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads], + shape=[(2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads], dtype=paddle.float32, - default_initializer=Constant(0.0), ) # 2*Wh-1 * 2*Ww-1, nH + default_initializer=Constant(0.0), + ) # 2*Wh-1 * 2*Ww-1, nH # get pair-wise relative position index for each token inside the window coords_h = paddle.arange(self.window_size[0]) coords_w = paddle.arange(self.window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww + 
coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[ - 0] - 1 # shift to start from 0 + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose([1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += self.window_size[1] - 1 relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww @@ -241,34 +233,32 @@ def forward(self, x, mask=None): mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape - qkv = (self.qkv(x) - .reshape([B_, N, 3, self.num_heads, C // self.num_heads]) - .transpose([2, 0, 3, 1, 4])) + qkv = self.qkv(x).reshape([B_, N, 3, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) q, k, v = ( qkv[0], qkv[1], - qkv[2], ) # make torchscript happy (cannot use tensor as tuple) + qkv[2], + ) # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) index = self.relative_position_index.flatten() - relative_position_bias = paddle.index_select( - self.relative_position_bias_table, index) + relative_position_bias = paddle.index_select(self.relative_position_bias_table, index) - relative_position_bias = relative_position_bias.reshape([ - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], - -1, - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + relative_position_bias = relative_position_bias.reshape( + [ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1, + ] + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] - attn = attn.reshape( - [-1, nW, self.num_heads, N, N]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, nW, self.num_heads, N, N]) + mask.unsqueeze(1).unsqueeze(0) attn = attn.reshape([-1, self.num_heads, N, N]) attn = self.softmax(attn) else: @@ -300,27 +290,27 @@ class SwinTransformerBlock(nn.Layer): """ def __init__( - self, - dim, - num_heads, - window_size=7, - shift_size=0, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, ): + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): super().__init__() self.dim = dim self.num_heads = num_heads self.window_size = window_size self.shift_size = shift_size self.mlp_ratio = mlp_ratio - assert (0 <= self.shift_size < self.window_size - ), "shift_size must in 0-window_size" + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" self.norm1 = norm_layer(dim) self.attn = WindowAttention( @@ -330,17 +320,18 @@ def __init__( qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, - proj_drop=drop, ) + proj_drop=drop, + ) - 
self.drop_path = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, - drop=drop, ) + drop=drop, + ) self.H = None self.W = None @@ -371,36 +362,26 @@ def forward(self, x, mask_matrix): # cyclic shift if self.shift_size > 0: - shifted_x = paddle.roll( - x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + shifted_x = paddle.roll(x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) attn_mask = mask_matrix else: shifted_x = x attn_mask = None # partition windows - x_windows = window_partition( - shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.reshape( - [-1, self.window_size * self.window_size, - C]) # nW*B, window_size*window_size, C + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape([-1, self.window_size * self.window_size, C]) # nW*B, window_size*window_size, C # W-MSA/SW-MSA - attn_windows = self.attn( - x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C # merge windows - attn_windows = attn_windows.reshape( - [-1, self.window_size, self.window_size, C]) - shifted_x = window_reverse(attn_windows, self.window_size, Hp, - Wp) # B H' W' C + attn_windows = attn_windows.reshape([-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C # reverse cyclic shift if self.shift_size > 0: - x = paddle.roll( - shifted_x, - shifts=(self.shift_size, self.shift_size), - axis=(1, 2)) + x = paddle.roll(shifted_x, shifts=(self.shift_size, self.shift_size), axis=(1, 2)) else: x = shifted_x @@ -477,20 +458,21 @@ class BasicLayer(nn.Layer): """ def __init__( - self, - dim, - depth, - num_heads, - window_size=7, - mlp_ratio=4.0, - qkv_bias=True, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - norm_layer=nn.LayerNorm, - downsample=None, - use_checkpoint=False, ): + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): super().__init__() self.window_size = window_size self.shift_size = window_size // 2 @@ -498,21 +480,24 @@ def __init__( self.use_checkpoint = use_checkpoint # build blocks - self.blocks = nn.LayerList([ - SwinTransformerBlock( - dim=dim, - num_heads=num_heads, - window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop, - attn_drop=attn_drop, - drop_path=drop_path[i] - if isinstance(drop_path, list) else drop_path, - norm_layer=norm_layer, ) for i in range(depth) - ]) + self.blocks = nn.LayerList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) # patch merging layer if downsample is not None: @@ -530,29 +515,27 @@ def forward(self, x, H, W): # calculate attention 
mask for SW-MSA Hp = (H + self.window_size - 1) // self.window_size * self.window_size Wp = (W + self.window_size - 1) // self.window_size * self.window_size - img_mask = paddle.zeros( - (1, Hp, Wp, 1), dtype=paddle.float32) # 1 Hp Wp 1 + img_mask = paddle.zeros((1, Hp, Wp, 1), dtype=paddle.float32) # 1 Hp Wp 1 h_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), ) + slice(-self.shift_size, None), + ) w_slices = ( slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None), ) + slice(-self.shift_size, None), + ) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 - mask_windows = window_partition( - img_mask, self.window_size) # nW, window_size, window_size, 1 - mask_windows = mask_windows.reshape( - [-1, self.window_size * self.window_size]) + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape([-1, self.window_size * self.window_size]) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = (-100.0 * paddle.ones_like(attn_mask) * - (attn_mask != 0).astype(paddle.float32)) + attn_mask = -100.0 * paddle.ones_like(attn_mask) * (attn_mask != 0).astype(paddle.float32) for blk in self.blocks: blk.H, blk.W = H, W @@ -585,8 +568,7 @@ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): self.in_chans = in_chans self.embed_dim = embed_dim - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) if norm_layer is not None: self.norm = norm_layer(embed_dim) else: @@ -640,32 +622,29 @@ def __init__(self, config: SwinTransformerConfig): patch_size=self.patch_size, in_chans=self.in_chans, embed_dim=self.embed_dim, - norm_layer=self.norm_layer if self.patch_norm else None, ) + norm_layer=self.norm_layer if self.patch_norm else None, + ) # absolute position embedding if self.ape: - patch_size = to_2tuple(self.patch_size) + # patch_size = to_2tuple(self.patch_size) patches_resolution = [ self.pretrain_img_size[0] // self.patch_size[0], self.pretrain_img_size[1] // self.patch_size[1], ] self.absolute_pos_embed = self.create_parameter( - shape=[ - 1, self.embed_dim, patches_resolution[0], - patches_resolution[1] - ], + shape=[1, self.embed_dim, patches_resolution[0], patches_resolution[1]], dtype=paddle.float32, - default_initializer=Constant(0.0), ) + default_initializer=Constant(0.0), + ) trunc_normal_(self.absolute_pos_embed) self.pos_drop = nn.Dropout(p=config.drop_rate) # stochastic depth dpr = [ - x.item() - for x in paddle.linspace(0, config.drop_path_rate, - sum(config.depths)) + x.item() for x in paddle.linspace(0, config.drop_path_rate, sum(config.depths)) ] # stochastic depth decay rule # build layers @@ -673,13 +652,10 @@ def __init__(self, config: SwinTransformerConfig): # prepare downsample list downsamplelist = [PatchMerging for i in range(self.num_layers)] downsamplelist[-1] = None - num_features = [ - int(self.embed_dim * 2**i) for i in range(self.num_layers) - ] + num_features = [int(self.embed_dim * 2**i) for i in range(self.num_layers)] if self.dilation: downsamplelist[-2] = None - num_features[-1] = int(self.embed_dim * 2 - **(self.num_layers - 1)) // 2 + num_features[-1] = int(self.embed_dim * 2 ** (self.num_layers - 1)) // 2 for i_layer in range(self.num_layers): layer = BasicLayer( 
dim=num_features[i_layer], @@ -691,11 +667,11 @@ def __init__(self, config: SwinTransformerConfig): qk_scale=config.qk_scale, drop=config.drop_rate, attn_drop=config.attn_drop_rate, - drop_path=dpr[sum(config.depths[:i_layer]):sum( - config.depths[:i_layer + 1])], + drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], norm_layer=self.norm_layer, downsample=downsamplelist[i_layer], - use_checkpoint=config.use_checkpoint, ) + use_checkpoint=config.use_checkpoint, + ) self.layers.append(layer) self.num_features = num_features @@ -715,7 +691,7 @@ def _freeze_stages(self): param.stop_gradient = True if self.frozen_stages >= 1 and self.ape: - self.absolute_pos_embed.stop_gradient = Trueƒ + self.absolute_pos_embed.stop_gradient = True if self.frozen_stages >= 2: self.pos_drop.eval() @@ -732,10 +708,8 @@ def forward_raw(self, x): Wh, Ww = x.shape[2:4] if self.ape: # interpolate the position embedding to the corresponding size - absolute_pos_embed = F.interpolate( - self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") - x = (x + absolute_pos_embed).flatten(2).transpose( - [0, 2, 1]) # B Wh*Ww C + absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") + x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C else: x = x.flatten(2).transpose([0, 2, 1]) x = self.pos_drop(x) @@ -749,8 +723,7 @@ def forward_raw(self, x): norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) - out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( - (0, 3, 1, 2)) + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) outs.append(out) # in: # torch.Size([2, 3, 1024, 1024]) @@ -766,10 +739,8 @@ def forward_with_mask(self, x: paddle.Tensor, m: paddle.Tensor): Wh, Ww = x.shape[2], x.shape[3] if self.ape: # interpolate the position embedding to the corresponding size - absolute_pos_embed = F.interpolate( - self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") - x = (x + absolute_pos_embed).flatten(2).transpose( - [0, 2, 1]) # B Wh*Ww C + absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") + x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) # B Wh*Ww C else: x = x.flatten(2).transpose([0, 2, 1]) x = self.pos_drop(x) @@ -783,17 +754,14 @@ def forward_with_mask(self, x: paddle.Tensor, m: paddle.Tensor): norm_layer = getattr(self, f"norm{i}") x_out = norm_layer(x_out) - out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( - (0, 3, 1, 2)) + out = x_out.reshape((-1, H, W, self.num_features[i])).transpose((0, 3, 1, 2)) outs.append(out) feat_dict = [] mask_dict = [] for idx, out_i in enumerate(outs): assert m is not None - mask = F.interpolate( - m[None].cast(paddle.float32), - size=out_i.shape[-2:]).cast(paddle.bool)[0] + mask = F.interpolate(m[None].cast(paddle.float32), size=out_i.shape[-2:]).cast(paddle.bool)[0] feat_dict.append(out_i) mask_dict.append(mask) diff --git a/paddlemix/models/groundingdino/bert_model.py b/paddlemix/models/groundingdino/bert_model.py index e661f4621bc86..ca9c89bb8b5b2 100644 --- a/paddlemix/models/groundingdino/bert_model.py +++ b/paddlemix/models/groundingdino/bert_model.py @@ -14,15 +14,14 @@ import math import warnings -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Tuple import numpy as np import paddle import paddle.nn as nn -import paddle.nn.functional as F -from paddlenlp.taskflow.utils import pad_batch_data -from paddlenlp.transformers.bert.modeling import \ - 
BaseModelOutputWithPoolingAndCrossAttentions +from paddlenlp.transformers.bert.modeling import ( + BaseModelOutputWithPoolingAndCrossAttentions, +) class GELUActivation(nn.Layer): @@ -33,7 +32,7 @@ class GELUActivation(nn.Layer): Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ - def __init__(self, use_gelu_python: bool=False): + def __init__(self, use_gelu_python: bool = False): super().__init__() self.act = nn.functional.gelu @@ -42,20 +41,16 @@ def forward(self, input): class BertSelfAttention(nn.Layer): - def __init__(self, - config, - clamp_min_for_underflow=False, - clamp_max_for_overflow=False): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})") + f"heads ({config.num_attention_heads})" + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) @@ -63,8 +58,7 @@ def __init__(self, self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") # 'absolute' + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # 'absolute' self.clamp_min_for_underflow = clamp_min_for_underflow self.clamp_max_for_overflow = clamp_max_for_overflow @@ -73,19 +67,21 @@ def __init__(self, def transpose_for_scores(self, x): new_x_shape = tuple(x.shape[:-1]) + ( self.num_attention_heads, - self.attention_head_size, ) + self.attention_head_size, + ) x = x.reshape(new_x_shape) return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): mixed_query_layer = self.query(hidden_states) @@ -97,17 +93,14 @@ def forward( value_layer = past_key_value[1] attention_mask = encoder_attention_mask elif is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: # here key_layer = 
self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -118,11 +111,9 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = paddle.matmul(query_layer, - key_layer.transpose([0, 1, 3, 2])) + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) # return attention_scores - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if self.clamp_min_for_underflow: attention_scores = paddle.clip(attention_scores, min=-50000) @@ -143,15 +134,13 @@ def forward( context_layer = paddle.matmul(attention_probs, value_layer) context_layer = context_layer.transpose([0, 2, 1, 3]) - new_context_layer_shape = tuple(context_layer.shape[:-2]) + ( - self.all_head_size, ) + new_context_layer_shape = tuple(context_layer.shape[:-2]) + (self.all_head_size,) context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) if self.is_decoder: - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -159,8 +148,7 @@ class BertSelfOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -171,25 +159,22 @@ def forward(self, hidden_states, input_tensor): class BertAttention(nn.Layer): - def __init__(self, - config, - clamp_min_for_underflow=False, - clamp_max_for_overflow=False): + def __init__(self, config, clamp_min_for_underflow=False, clamp_max_for_overflow=False): super().__init__() - self.self = BertSelfAttention(config, clamp_min_for_underflow, - clamp_max_for_overflow) + self.self = BertSelfAttention(config, clamp_min_for_underflow, clamp_max_for_overflow) self.output = BertSelfOutput(config) self.pruned_heads = set() def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): self_outputs = self.self( hidden_states, attention_mask, @@ -197,11 +182,11 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) # pass + output_attentions, + ) # pass # return self_outputs attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -224,8 +209,7 @@ class BertOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, 
epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -240,35 +224,32 @@ class BertEmbeddings(nn.Layer): def __init__(self, config): super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, - config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) - - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", - paddle.arange(config.max_position_embeddings).reshape((1, -1)), ) + paddle.arange(config.max_position_embeddings).reshape((1, -1)), + ) self.register_buffer( "token_type_ids", - paddle.zeros( - self.position_ids.shape, dtype=paddle.int64), - persistable=False, ) + paddle.zeros(self.position_ids.shape, dtype=paddle.int64), + persistable=False, + ) def forward( - self, - input_ids=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - past_key_values_length=0, ): + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + ): if input_ids is not None: input_shape = input_ids.shape else: @@ -277,15 +258,12 @@ def forward( seq_length = input_shape[1] if position_ids is None: - position_ids = self.position_ids[:, past_key_values_length: - seq_length + - past_key_values_length] + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] if token_type_ids is None: if hasattr(self, "token_type_ids"): buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - [input_shape[0], seq_length]) + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([input_shape[0], seq_length]) token_type_ids = buffered_token_type_ids_expanded else: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) @@ -314,32 +292,30 @@ def __init__(self, config): self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: - raise ValueError( - f"{self} should be used as a decoder model if cross attention is added" - ) - self.crossattention = BertAttention( - config, position_embedding_type="absolute") + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") self.intermediate = BertIntermediate(config) self.output = BertOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): - - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self, + hidden_states, + attention_mask=None, + head_mask=None, 
+ encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) # return self_attention_outputs attention_output = self_attention_outputs[0] # if decoder, the last output is tuple of self-attn cache @@ -347,18 +323,17 @@ def forward( outputs = self_attention_outputs[1:-1] present_key_value = self_attention_outputs[-1] else: - outputs = self_attention_outputs[ - 1:] # add self attentions if we output attention weights + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights cross_attn_present_key_value = None if self.is_decoder and encoder_hidden_states is not None: if not hasattr(self, "crossattention"): raise ValueError( f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" - " by setting `config.add_cross_attention=True`") + " by setting `config.add_cross_attention=True`" + ) - cross_attn_past_key_value = (past_key_value[-2:] if - past_key_value is not None else None) + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None cross_attention_outputs = self.crossattention( attention_output, attention_mask, @@ -366,7 +341,8 @@ def forward( encoder_hidden_states, encoder_attention_mask, cross_attn_past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:-1] @@ -375,10 +351,10 @@ def forward( layer_output = self.feed_forward_chunk(attention_output) - outputs = (layer_output, ) + outputs + outputs = (layer_output,) + outputs if self.is_decoder: - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs @@ -392,35 +368,33 @@ class BertEncoder(nn.Layer): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.LayerList( - [BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.LayerList([BertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None - all_cross_attentions = (() if output_attentions and - self.config.add_cross_attention else None) + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None next_decoder_cache = () if use_cache else None for i, layer_module in enumerate(self.layer): if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is 
not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None layer_outputs = layer_module( hidden_states, @@ -429,35 +403,39 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) # return layer_outputs hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=hidden_states, pooler_output=None, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, - attentions=all_self_attentions, ) + attentions=all_self_attentions, + ) class BertPooler(nn.Layer): @@ -510,11 +488,12 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - device: str=None, - dtype: np.float=None, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + device: str = None, + dtype: np.float = None, + ) -> paddle.Tensor: if dtype is None: dtype = np.float32 @@ -523,7 +502,8 @@ def get_extended_attention_mask( if device is not None: warnings.warn( "The `device` argument is deprecated and will be removed in v5 of Transformers.", - FutureWarning, ) + FutureWarning, + ) if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] @@ -535,34 +515,30 @@ def get_extended_attention_mask( f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" ) - extended_attention_mask = paddle.cast( - extended_attention_mask, dtype=dtype) # fp16 compatibility - extended_attention_mask = ( - 1.0 - extended_attention_mask) * np.finfo(dtype).min + extended_attention_mask = paddle.cast(extended_attention_mask, dtype=dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * np.finfo(dtype).min return extended_attention_mask - def get_head_mask(self, - head_mask, - num_hidden_layers, - is_attention_chunked=False): + def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False): head_mask = [None] * num_hidden_layers return head_mask def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + 
encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -583,13 +559,11 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -597,47 +571,38 @@ def forward( use_cache = False if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.shape elif inputs_embeds is not None: input_shape = inputs_embeds.shape[:-1] else: - raise ValueError( - "You have to specify either input_ids or inputs_embeds") + raise ValueError("You have to specify either input_ids or inputs_embeds") batch_size, seq_length = input_shape # past_key_values_length - past_key_values_length = (past_key_values[0][0].shape[2] - if past_key_values is not None else 0) + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) if token_type_ids is None: if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, : - seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand( - [batch_size, seq_length]) + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([batch_size, seq_length]) token_type_ids = buffered_token_type_ids_expanded else: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) - extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) if self.config.is_decoder and encoder_hidden_states is not None: encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = 
self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -648,7 +613,8 @@ def forward( position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) # return embedding_output encoder_outputs = self.encoder( embedding_output, @@ -660,11 +626,11 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) # return encoder_outputs sequence_output = encoder_outputs[0] - pooled_output = (self.pooler(sequence_output) - if self.pooler is not None else None) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] @@ -675,7 +641,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class language_model(nn.Layer): @@ -685,9 +652,9 @@ def __init__(self, cfg, bert_config): self.bert_name = cfg.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE print( "LANGUAGE BACKBONE USE GRADIENT CHECKPOINTING: ", - self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT, ) - bert_config.gradient_checkpointing = ( - self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT) + self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT, + ) + bert_config.gradient_checkpointing = self.cfg.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT self.model = BertModel(bert_config) self.language_dim = 768 @@ -702,20 +669,18 @@ def forward(self, x): outputs = self.model( input_ids=input, attention_mask=mask, - output_hidden_states=True, ) + output_hidden_states=True, + ) # outputs has 13 layers, 1 input layer and 12 hidden layers encoded_layers = outputs.hidden_states[1:] features = None - features = paddle.stack(encoded_layers[-self.num_layers:], - 1).mean(1) + features = paddle.stack(encoded_layers[-self.num_layers :], 1).mean(1) # language embedding has shape [len(phrase), seq_len, language_dim] features = features / self.num_layers - embedded = paddle.cast(features * mask.unsqueeze(-1), - paddle.float32) - aggregate = embedded.sum(1) / ( - paddle.cast(mask.sum(-1).unsqueeze(-1), paddle.float32)) + embedded = paddle.cast(features * mask.unsqueeze(-1), paddle.float32) + aggregate = embedded.sum(1) / (paddle.cast(mask.sum(-1).unsqueeze(-1), paddle.float32)) ret = { "aggregate": aggregate, diff --git a/paddlemix/models/groundingdino/bertwarper.py b/paddlemix/models/groundingdino/bertwarper.py index ad618d8cfd9b0..abc4f4d02a286 100644 --- a/paddlemix/models/groundingdino/bertwarper.py +++ b/paddlemix/models/groundingdino/bertwarper.py @@ -14,9 +14,9 @@ import paddle import paddle.nn as nn -import paddle.nn.functional as F -from paddlenlp.transformers.model_outputs import \ - BaseModelOutputWithPoolingAndCrossAttentions +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, +) from .bert_model import BertModel @@ -37,20 +37,21 @@ def __init__(self, bert_model): self.use_return_dict = True def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - 
output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -71,11 +72,10 @@ def forward( If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up decoding (see :obj:`past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) return_dict = return_dict if return_dict is not None else self.use_return_dict if self.config.is_decoder: @@ -84,9 +84,7 @@ def forward( use_cache = False if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time" - ) + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: input_shape = input_ids.shape batch_size, seq_length = input_shape @@ -94,23 +92,19 @@ def forward( input_shape = inputs_embeds.shape[:-1] batch_size, seq_length = input_shape else: - raise ValueError( - "You have to specify either input_ids or inputs_embeds") + raise ValueError("You have to specify either input_ids or inputs_embeds") # past_key_values_length - past_key_values_length = (past_key_values[0][0].shape[2] - if past_key_values is not None else 0) + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) if token_type_ids is None: token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -138,7 +132,8 @@ def forward( position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, ) + past_key_values_length=past_key_values_length, + ) encoder_outputs = self.encoder( embedding_output, @@ -150,10 +145,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) sequence_output = encoder_outputs[0] - pooled_output = (self.pooler(sequence_output) - if self.pooler is not None else None) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] @@ -164,7 +159,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class TextEncoderShell(nn.Layer): @@ -178,8 +174,7 @@ def forward(self, **kw): return self.text_encoder(**kw) -def generate_masks_with_special_tokens(tokenized, special_tokens_list, - tokenizer): +def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer): """Generate attention mask between each pair of special tokens Args: input_ids (torch.Tensor): input ids. Shape: [bs, num_token] @@ -198,8 +193,7 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, idxs = paddle.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = ( - paddle.eye(num_token, dtype=paddle.bool).unsqueeze(0).tile([bs, 1, 1])) + attention_mask = paddle.eye(num_token, dtype=paddle.bool).unsqueeze(0).tile([bs, 1, 1]) position_ids = paddle.zeros((bs, num_token)) previous_col = 0 for i in range(idxs.shape[0]): @@ -208,10 +202,8 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1:col + 1, previous_col + 1:col + - 1] = True - position_ids[row, previous_col + 1:col + 1] = paddle.arange( - 0, col - previous_col) + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) previous_col = col @@ -222,8 +214,7 @@ def generate_masks_with_special_tokens(tokenized, special_tokens_list, return attention_mask, position_ids.cast(paddle.int64) -def generate_masks_with_special_tokens_and_transfer_map( - tokenized, special_tokens_list, tokenizer): +def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer): """Generate attention mask between each pair of special tokens Args: input_ids (torch.Tensor): input ids. 
Shape: [bs, num_token] @@ -242,8 +233,7 @@ def generate_masks_with_special_tokens_and_transfer_map( idxs = paddle.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = (paddle.eye(num_token, dtype=paddle.int32) - .cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1])) + attention_mask = paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) position_ids = paddle.zeros((bs, num_token)) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -253,12 +243,14 @@ def generate_masks_with_special_tokens_and_transfer_map( attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1:col + 1, previous_col + 1:col + - 1] = True - position_ids[row, previous_col + 1:col + 1] = paddle.arange( - 0, col - previous_col) - c2t_maski = paddle.zeros([num_token, ]).cast(paddle.bool) - c2t_maski[previous_col + 1:col] = True + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros( + [ + num_token, + ] + ).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True cate_to_token_mask_list[row].append(c2t_maski) previous_col = col @@ -271,5 +263,4 @@ def generate_masks_with_special_tokens_and_transfer_map( # padding_mask = tokenized['attention_mask'] # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() - return attention_mask, position_ids.cast( - paddle.int64), cate_to_token_mask_list + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list diff --git a/paddlemix/models/groundingdino/configuration.py b/paddlemix/models/groundingdino/configuration.py index 853217aa9daa3..da41db3e25490 100644 --- a/paddlemix/models/groundingdino/configuration.py +++ b/paddlemix/models/groundingdino/configuration.py @@ -17,6 +17,8 @@ from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlemix.utils.log import logger + __all__ = ["GroundingDinoConfig"] @@ -25,50 +27,51 @@ class GroundingDinoConfig(PretrainedConfig): model_type = "groundingdino" def __init__( - self, - modelname="groundingdino", - backbone="swin_T_224_1k", - position_embedding="sine", - pe_temperatureH=20, - pe_temperatureW=20, - return_interm_indices=[1, 2, 3], - backbone_freeze_keywords=None, - enc_layers=6, - dec_layers=6, - pre_norm=False, - dim_feedforward=2048, - hidden_dim=256, - dropout=0.0, - nheads=8, - num_queries=900, - query_dim=4, - num_patterns=0, - num_feature_levels=4, - enc_n_points=4, - dec_n_points=4, - two_stage_type="standard", - two_stage_bbox_embed_share=False, - two_stage_class_embed_share=False, - transformer_activation="relu", - dec_pred_bbox_embed_share=True, - dn_box_noise_scale=1.0, - dn_label_noise_ratio=0.5, - dn_label_coef=1.0, - dn_bbox_coef=1.0, - embed_init_tgt=True, - dn_labelbook_size=2000, - max_text_len=256, - text_encoder_type="bert-base-uncased", - use_text_enhancer=True, - use_fusion_layer=True, - use_checkpoint=False, - use_transformer_ckpt=False, - use_text_cross_attention=True, - text_dropout=0.0, - fusion_dropout=0.0, - fusion_droppath=0.1, - sub_sentence_present=True, - **kwargs, ): + self, + modelname="groundingdino", + backbone="swin_T_224_1k", + position_embedding="sine", + pe_temperatureH=20, + pe_temperatureW=20, + return_interm_indices=[1, 2, 3], + backbone_freeze_keywords=None, + enc_layers=6, + dec_layers=6, + pre_norm=False, + 
dim_feedforward=2048, + hidden_dim=256, + dropout=0.0, + nheads=8, + num_queries=900, + query_dim=4, + num_patterns=0, + num_feature_levels=4, + enc_n_points=4, + dec_n_points=4, + two_stage_type="standard", + two_stage_bbox_embed_share=False, + two_stage_class_embed_share=False, + transformer_activation="relu", + dec_pred_bbox_embed_share=True, + dn_box_noise_scale=1.0, + dn_label_noise_ratio=0.5, + dn_label_coef=1.0, + dn_bbox_coef=1.0, + embed_init_tgt=True, + dn_labelbook_size=2000, + max_text_len=256, + text_encoder_type="bert-base-uncased", + use_text_enhancer=True, + use_fusion_layer=True, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=True, + text_dropout=0.0, + fusion_dropout=0.0, + fusion_droppath=0.1, + sub_sentence_present=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) self.modelname = modelname @@ -115,14 +118,10 @@ def __init__( self.sub_sentence_present = sub_sentence_present @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
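
A minimal usage sketch of the GroundingDinoConfig class reformatted in the hunk above, assuming paddlemix and its paddlenlp dependency are importable under the file layout shown in these paths; the override values below are illustrative only, not a recommended setup.

from paddlemix.models.groundingdino.configuration import GroundingDinoConfig

# Keyword arguments mirror the constructor signature above; anything not passed
# falls back to the defaults shown in the diff (e.g. hidden_dim=256, num_queries=900).
config = GroundingDinoConfig(backbone="swin_T_224_1k", max_text_len=256)

# __init__ forces return_dict to True unless explicitly provided, via
# kwargs.pop("return_dict", True) before calling the PretrainedConfig base class.
print(config.model_type)  # "groundingdino" (class attribute)
print(config.modelname)   # "groundingdino" (set in __init__)

As with the swin config earlier in this diff, from_pretrained() loads the config dict and only warns, rather than fails, when the stored model_type differs from cls.model_type.
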
diff --git a/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py b/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py index 869e9e8314fa9..dd45756efcf85 100644 --- a/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py +++ b/paddlemix/models/groundingdino/csrc/setup_ms_deformable_attn_op.py @@ -17,5 +17,5 @@ if __name__ == "__main__": setup( name="deformable_detr_ops", - ext_modules=CUDAExtension( - sources=["ms_deformable_attn_op.cc", "ms_deformable_attn_op.cu"]), ) + ext_modules=CUDAExtension(sources=["ms_deformable_attn_op.cc", "ms_deformable_attn_op.cu"]), + ) diff --git a/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py b/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py index f6e4818963d64..3e7739510df32 100644 --- a/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py +++ b/paddlemix/models/groundingdino/csrc/test_ms_deformable_attn_op.py @@ -50,22 +50,15 @@ bs, n_heads, c = 2, 8, 8 query_length, n_levels, n_points = 2, 2, 2 spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) -level_start_index = paddle.concat((paddle.to_tensor( - [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) +level_start_index = paddle.concat((paddle.to_tensor([0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) value_length = sum([(H * W).item() for H, W in spatial_shapes]) def get_test_tensors(channels): - value = (paddle.rand( - [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01) - sampling_locations = paddle.rand( - [bs, query_length, n_heads, n_levels, n_points, 2], - dtype=paddle.float32) - attention_weights = (paddle.rand( - [bs, query_length, n_heads, n_levels, n_points], dtype=paddle.float32) + - 1e-5) - attention_weights /= attention_weights.sum(-1, keepdim=True).sum( - -2, keepdim=True) + value = paddle.rand([bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 + sampling_locations = paddle.rand([bs, query_length, n_heads, n_levels, n_points, 2], dtype=paddle.float32) + attention_weights = paddle.rand([bs, query_length, n_heads, n_levels, n_points], dtype=paddle.float32) + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) return [value, sampling_locations, attention_weights] @@ -74,23 +67,31 @@ def get_test_tensors(channels): def check_forward_equal_with_paddle_float(): value, sampling_locations, attention_weights = get_test_tensors(c) - output_paddle = (ms_deform_attn_core_paddle( - value, - spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, ).detach().cpu()) - output_cuda = (ms_deformable_attn( - value, - spatial_shapes, - level_start_index, - sampling_locations, - attention_weights, ).detach().cpu()) - fwdok = paddle.allclose( - output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() + output_paddle = ( + ms_deform_attn_core_paddle( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + ) + .detach() + .cpu() + ) + output_cuda = ( + ms_deformable_attn( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + ) + .detach() + .cpu() + ) + fwdok = paddle.allclose(output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() max_abs_err = (output_cuda - output_paddle).abs().max().item() - max_rel_err = (( - (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item()) + max_rel_err = ((output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() print( f"*{fwdok} 
check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}" @@ -101,7 +102,8 @@ def check_gradient_numerical(channels=4): ( value_paddle, sampling_locations_paddle, - attention_weights_paddle, ) = get_test_tensors(channels) + attention_weights_paddle, + ) = get_test_tensors(channels) value_paddle.stop_gradient = False sampling_locations_paddle.stop_gradient = False attention_weights_paddle.stop_gradient = False @@ -118,7 +120,8 @@ def check_gradient_numerical(channels=4): spatial_shapes, level_start_index, sampling_locations_paddle, - attention_weights_paddle, ) + attention_weights_paddle, + ) output_paddle.sum().backward() output_cuda = ms_deformable_attn( @@ -126,25 +129,22 @@ def check_gradient_numerical(channels=4): spatial_shapes, level_start_index, sampling_locations_cuda, - attention_weights_cuda, ) + attention_weights_cuda, + ) output_cuda.sum().backward() - res = paddle.allclose( - value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() + res = paddle.allclose(value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f"*tensor1 {res} check_gradient_numerical(D={channels})") res = paddle.allclose( sampling_locations_paddle.grad, sampling_locations_cuda.grad, rtol=1e-2, - atol=1e-3, ).item() + atol=1e-3, + ).item() print(f"*tensor2 {res} check_gradient_numerical(D={channels})") - res = paddle.allclose( - attention_weights_paddle.grad, - attention_weights_cuda.grad, - rtol=1e-2, - atol=1e-3).item() + res = paddle.allclose(attention_weights_paddle.grad, attention_weights_cuda.grad, rtol=1e-2, atol=1e-3).item() print(f"*tensor3 {res} check_gradient_numerical(D={channels})") diff --git a/paddlemix/models/groundingdino/fuse_modules.py b/paddlemix/models/groundingdino/fuse_modules.py index f395f060c7f94..7940a81a8fcdf 100644 --- a/paddlemix/models/groundingdino/fuse_modules.py +++ b/paddlemix/models/groundingdino/fuse_modules.py @@ -58,11 +58,7 @@ def l2norm(X, dim, eps=1e-8): return X -def func_attention(query, - context, - smooth=1, - raw_feature_norm="softmax", - eps=1e-8): +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): """ query: (n_context, queryL, d) context: (n_context, sourceL, d) @@ -112,13 +108,7 @@ def func_attention(query, class BiMultiHeadAttention(nn.Layer): - def __init__(self, - v_dim, - l_dim, - embed_dim, - num_heads, - dropout=0.1, - cfg=None): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): super(BiMultiHeadAttention, self).__init__() self.embed_dim = embed_dim @@ -130,7 +120,7 @@ def __init__(self, assert ( self.head_dim * self.num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
- self.scale = self.head_dim**(-0.5) + self.scale = self.head_dim ** (-0.5) self.dropout = dropout self.v_proj = nn.Linear(self.v_dim, self.embed_dim) @@ -148,9 +138,7 @@ def __init__(self, self._reset_parameters() def _shape(self, tensor, seq_len, bsz): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def _reset_parameters(self): xavier_uniform_(self.v_proj.weight) @@ -187,16 +175,13 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): value_l_states = self._shape(self.values_l_proj(l), -1, bsz) proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, - bsz).reshape(proj_shape) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) key_states = key_states.reshape(proj_shape) value_v_states = value_v_states.reshape(proj_shape) value_l_states = value_l_states.reshape(proj_shape) src_len = key_states.shape[1] - attn_weights = paddle.bmm( - query_states, - key_states.transpose([0, 2, 1])) # bs*nhead, nimg, ntxt + attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1])) # bs*nhead, nimg, ntxt if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: raise ValueError( @@ -216,8 +201,7 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): ) # Do not increase 50000, data type half has quite limited range attn_weights_T = attn_weights.transpose([0, 2, 1]) - attn_weights_l = attn_weights_T - paddle.max( - attn_weights_T, axis=-1, keepdim=True) + attn_weights_l = attn_weights_T - paddle.max(attn_weights_T, axis=-1, keepdim=True) if self.clamp_min_for_underflow: attn_weights_l = paddle.clip( attn_weights_l, min=-50000 @@ -230,53 +214,43 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): # mask vison for language if attention_mask_v is not None: - attention_mask_v = (attention_mask_v[:, None, None, :] - .cast(paddle.float32) - .tile([1, self.num_heads, 1, 1]).flatten(0, 1)) - attn_weights_l = masked_fill(attn_weights_l, - attention_mask_v == 1.0, float("-inf")) + attention_mask_v = ( + attention_mask_v[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights_l = masked_fill(attn_weights_l, attention_mask_v == 1.0, float("-inf")) attn_weights_l = F.softmax(attn_weights_l, axis=-1) # mask language for vision if attention_mask_l is not None: - attention_mask_l = (attention_mask_l[:, None, None, :] - .cast(paddle.float32) - .tile([1, self.num_heads, 1, 1]).flatten(0, 1)) - attn_weights = masked_fill(attn_weights, attention_mask_l == 1.0, - float("-inf")) + attention_mask_l = ( + attention_mask_l[:, None, None, :].cast(paddle.float32).tile([1, self.num_heads, 1, 1]).flatten(0, 1) + ) + attn_weights = masked_fill(attn_weights, attention_mask_l == 1.0, float("-inf")) attn_weights_v = F.softmax(attn_weights, axis=-1) - attn_probs_v = F.dropout( - attn_weights_v, p=self.dropout, training=self.training) - attn_probs_l = F.dropout( - attn_weights_l, p=self.dropout, training=self.training) + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) attn_output_v = paddle.bmm(attn_probs_v, value_l_states) attn_output_l = paddle.bmm(attn_probs_l, value_v_states) - if attn_output_v.shape != [ - bsz * self.num_heads, tgt_len, self.head_dim - ]: + if 
attn_output_v.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: raise ValueError( f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.shape}" ) - if attn_output_l.shape != [ - bsz * self.num_heads, src_len, self.head_dim - ]: + if attn_output_l.shape != [bsz * self.num_heads, src_len, self.head_dim]: raise ValueError( f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.shape}" ) - attn_output_v = attn_output_v.reshape( - [bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output_v = attn_output_v.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) attn_output_v = attn_output_v.transpose([0, 2, 1, 3]) attn_output_v = attn_output_v.reshape([bsz, tgt_len, self.embed_dim]) - attn_output_l = attn_output_l.reshape( - [bsz, self.num_heads, src_len, self.head_dim]) + attn_output_l = attn_output_l.reshape([bsz, self.num_heads, src_len, self.head_dim]) attn_output_l = attn_output_l.transpose([0, 2, 1, 3]) attn_output_l = attn_output_l.reshape([bsz, src_len, self.embed_dim]) @@ -289,15 +263,16 @@ def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): # Bi-Direction MHA (text->image, image->text) class BiAttentionBlock(nn.Layer): def __init__( - self, - v_dim, - l_dim, - embed_dim, - num_heads, - dropout=0.1, - drop_path=0.0, - init_values=1e-4, - cfg=None, ): + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): """ Inputs: embed_dim - Dimensionality of input and attention feature vectors @@ -316,26 +291,24 @@ def __init__( l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, - dropout=dropout, ) + dropout=dropout, + ) # add layer scale for training stability - self.drop_path = DropPath( - drop_path) if drop_path > 0.0 else nn.Identity() + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.gamma_v = self.create_parameter( shape=[v_dim], - attr=paddle.ParamAttr(initializer=Constant(init_values)), ) + attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) self.gamma_l = self.create_parameter( shape=[l_dim], - attr=paddle.ParamAttr(initializer=Constant(init_values)), ) + attr=paddle.ParamAttr(initializer=Constant(init_values)), + ) def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): v = self.layer_norm_v(v) l = self.layer_norm_l(l) - delta_v, delta_l = self.attn( - v, - l, - attention_mask_v=attention_mask_v, - attention_mask_l=attention_mask_l) + delta_v, delta_l = self.attn(v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l) # v, l = v + delta_v, l + delta_l v = v + self.drop_path(self.gamma_v * delta_v) l = l + self.drop_path(self.gamma_l * delta_l) diff --git a/paddlemix/models/groundingdino/layers.py b/paddlemix/models/groundingdino/layers.py index f8d4c01da9460..7e15936a14b35 100644 --- a/paddlemix/models/groundingdino/layers.py +++ b/paddlemix/models/groundingdino/layers.py @@ -94,13 +94,14 @@ class MultiHeadAttention(nn.Layer): """ def __init__( - self, - embed_dim, - num_heads, - dropout=0.0, - kdim=None, - vdim=None, - need_weights=False, ): + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + ): super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -112,20 +113,18 @@ def __init__( self.need_weights = need_weights self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == 
self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" if self._qkv_same_embed_dim: self.in_proj_weight = self.create_parameter( shape=[embed_dim, 3 * embed_dim], attr=None, dtype=self._dtype, - is_bias=False, ) + is_bias=False, + ) self.in_proj_bias = self.create_parameter( - shape=[3 * embed_dim], - attr=None, - dtype=self._dtype, - is_bias=True) + shape=[3 * embed_dim], attr=None, dtype=self._dtype, is_bias=True + ) else: self.q_proj = nn.Linear(embed_dim, embed_dim) self.k_proj = nn.Linear(self.kdim, embed_dim) @@ -147,15 +146,14 @@ def compute_qkv(self, tensor, index): if self._qkv_same_embed_dim: tensor = F.linear( x=tensor, - weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) - * self.embed_dim], - bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * - self.embed_dim] - if self.in_proj_bias is not None else None, ) + weight=self.in_proj_weight[:, index * self.embed_dim : (index + 1) * self.embed_dim], + bias=self.in_proj_bias[index * self.embed_dim : (index + 1) * self.embed_dim] + if self.in_proj_bias is not None + else None, + ) else: tensor = getattr(self, self._type_list[index])(tensor) - tensor = tensor.reshape( - [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + tensor = tensor.reshape([0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) return tensor def forward(self, query, key=None, value=None, attn_mask=None): @@ -201,12 +199,11 @@ def forward(self, query, key=None, value=None, attn_mask=None): key = query if key is None else key value = query if value is None else value # compute q ,k ,v - q, k, v = (self.compute_qkv(t, i) - for i, t in enumerate([query, key, value])) + q, k, v = (self.compute_qkv(t, i) for i, t in enumerate([query, key, value])) # scale dot product attention product = paddle.matmul(x=q, y=k, transpose_y=True) - scaling = float(self.head_dim)**-0.5 + scaling = float(self.head_dim) ** -0.5 product = product * scaling if attn_mask is not None: @@ -215,11 +212,7 @@ def forward(self, query, key=None, value=None, attn_mask=None): product = product + attn_mask weights = F.softmax(product) if self.dropout: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") out = paddle.matmul(weights, v) @@ -236,10 +229,7 @@ def forward(self, query, key=None, value=None, attn_mask=None): return out if len(outs) == 1 else tuple(outs) -def drop_path(x, - drop_prob: float=0.0, - training: bool=False, - scale_by_keep: bool=True): +def drop_path(x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, @@ -252,21 +242,17 @@ def drop_path(x, if drop_prob == 0.0 or not training: return x keep_prob = 1 - drop_prob - shape = (x.shape[0], ) + (1, ) * ( - x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets - random_tensor = paddle.bernoulli( - paddle.full( - shape, keep_prob, dtype=x.dtype)) + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = paddle.bernoulli(paddle.full(shape, keep_prob, dtype=x.dtype)) if keep_prob > 0.0 and scale_by_keep: - random_tensor = paddle.divide(random_tensor, - paddle.to_tensor(keep_prob)) + random_tensor = paddle.divide(random_tensor, paddle.to_tensor(keep_prob)) return x * random_tensor class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - def __init__(self, drop_prob: float=0.0, scale_by_keep: bool=True): + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): super(DropPath, self).__init__() self.drop_prob = drop_prob self.scale_by_keep = scale_by_keep diff --git a/paddlemix/models/groundingdino/modeling.py b/paddlemix/models/groundingdino/modeling.py index 58d778b122f28..90f4f25a2f20d 100644 --- a/paddlemix/models/groundingdino/modeling.py +++ b/paddlemix/models/groundingdino/modeling.py @@ -18,16 +18,12 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle import Tensor -from paddle.nn import Layer -from paddlenlp.transformers import AutoTokenizer, BertModel, RobertaModel -from paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers import BertModel, RobertaModel +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from paddlenlp.utils.initializer import constant_, xavier_uniform_ from .backbone import build_backbone -from .bertwarper import (BertModelWarper, generate_masks_with_special_tokens, - generate_masks_with_special_tokens_and_transfer_map) +from .bertwarper import BertModelWarper from .configuration import GroundingDinoConfig from .transformer import build_transformer from .utils import MLP, ContrastiveEmbed, inverse_sigmoid @@ -75,14 +71,12 @@ def __init__(self, config: GroundingDinoConfig): elif config.text_encoder_type == "roberta-base": self.bert = RobertaModel.from_pretrained(config.text_encoder_type) else: - raise ValueError("Unknown text_encoder_type {}".format( - config.text_encoder_type)) + raise ValueError("Unknown text_encoder_type {}".format(config.text_encoder_type)) self.bert.pooler.dense.weight.stop_gradient = True self.bert.pooler.dense.bias.stop_gradient = True self.bert = BertModelWarper(bert_model=self.bert) - self.feat_map = nn.Linear( - self.bert.config.hidden_size, self.hidden_dim, bias_attr=True) + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias_attr=True) constant_(self.feat_map.bias, 0) xavier_uniform_(self.feat_map.weight) @@ -94,32 +88,29 @@ def __init__(self, config: GroundingDinoConfig): in_channels = self.backbone.num_channels[_] input_proj_list.append( nn.Sequential( - nn.Conv2D( - in_channels, hidden_dim, kernel_size=1), - nn.GroupNorm(32, hidden_dim), )) + nn.Conv2D(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) for _ in range(config.num_feature_levels - num_backbone_outs): input_proj_list.append( nn.Sequential( - nn.Conv2D( - in_channels, - hidden_dim, - kernel_size=3, - stride=2, - 
padding=1), - nn.GroupNorm(32, hidden_dim), )) + nn.Conv2D(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) in_channels = hidden_dim self.input_proj = nn.LayerList(input_proj_list) else: - assert (two_stage_type == "no" - ), "two_stage_type should be no if num_feature_levels=1 !!!" - self.input_proj = nn.LayerList([ - nn.Sequential( - nn.Conv2D( - self.backbone.num_channels[-1], - hidden_dim, - kernel_size=1), - nn.GroupNorm(32, hidden_dim), ) - ]) + # assert two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" + self.input_proj = nn.LayerList( + [ + nn.Sequential( + nn.Conv2D(self.backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) # prepare class & box embed _class_embed = ContrastiveEmbed() @@ -129,17 +120,10 @@ def __init__(self, config: GroundingDinoConfig): constant_(_bbox_embed.layers[-1].bias, 0) if config.dec_pred_bbox_embed_share: - box_embed_layerlist = [ - _bbox_embed for i in range(self.transformer.num_decoder_layers) - ] + box_embed_layerlist = [_bbox_embed for i in range(self.transformer.num_decoder_layers)] else: - box_embed_layerlist = [ - copy.deepcopy(_bbox_embed) - for i in range(self.transformer.num_decoder_layers) - ] - class_embed_layerlist = [ - _class_embed for i in range(self.transformer.num_decoder_layers) - ] + box_embed_layerlist = [copy.deepcopy(_bbox_embed) for i in range(self.transformer.num_decoder_layers)] + class_embed_layerlist = [_class_embed for i in range(self.transformer.num_decoder_layers)] self.bbox_embed = nn.LayerList(box_embed_layerlist) self.class_embed = nn.LayerList(class_embed_layerlist) self.transformer.decoder.bbox_embed = self.bbox_embed @@ -161,8 +145,7 @@ def __init__(self, config: GroundingDinoConfig): assert config.dec_pred_bbox_embed_share self.transformer.enc_out_class_embed = _class_embed else: - self.transformer.enc_out_class_embed = copy.deepcopy( - _class_embed) + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) self.refpoint_embed = None @@ -178,14 +161,15 @@ def init_ref_points(self, use_num_queries): self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) def forward( - self, - x: paddle.Tensor, - m: paddle.Tensor, - input_ids: paddle.Tensor, - attention_mask: paddle.Tensor, - text_self_attention_masks: paddle.Tensor, - position_ids: paddle.Tensor=None, - targets: List=None, ): + self, + x: paddle.Tensor, + m: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: paddle.Tensor, + text_self_attention_masks: paddle.Tensor, + position_ids: paddle.Tensor = None, + targets: List = None, + ): tokenized = { "input_ids": input_ids, @@ -194,10 +178,7 @@ def forward( # extract text embeddings if self.sub_sentence_present: - tokenized_for_encoder = { - k: v - for k, v in tokenized.items() if k != "attention_mask" - } + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} tokenized_for_encoder["attention_mask"] = text_self_attention_masks tokenized_for_encoder["position_ids"] = position_ids else: @@ -206,28 +187,22 @@ def forward( bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 - encoded_text = self.feat_map( - bert_output["last_hidden_state"]) # bs, 195, d_model - text_token_mask = tokenized["attention_mask"].cast( - paddle.bool) # bs, 195 + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized["attention_mask"].cast(paddle.bool) # bs, 195 # text_token_mask: True for 
nomask, False for mask # text_self_attention_masks: True for nomask, False for mask if encoded_text.shape[1] > self.max_text_len: - encoded_text = encoded_text[:, :self.max_text_len, :] - text_token_mask = text_token_mask[:, :self.max_text_len] - position_ids = position_ids[:, :self.max_text_len] - text_self_attention_masks = text_self_attention_masks[:, :self. - max_text_len, : - self. - max_text_len] + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[:, : self.max_text_len, : self.max_text_len] text_dict = { "encoded_text": encoded_text, # bs, 195, d_model "text_token_mask": text_token_mask, # bs, 195 "position_ids": position_ids, # bs, 195 - "text_self_attention_masks": - text_self_attention_masks, # bs, 195,195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 } features, feat_masks, poss = self.backbone(x, m) @@ -249,40 +224,35 @@ def forward( else: src = self.input_proj[l](srcs[-1]) # m = samples.mask - mask = F.interpolate( - m[None].cast(paddle.float32), - size=src.shape[-2:]).cast(paddle.bool)[0] + mask = F.interpolate(m[None].cast(paddle.float32), size=src.shape[-2:]).cast(paddle.bool)[0] # pos_l = self.backbone[1](NestedTensor(src, mask)).cast(src.dtype) pos_l = self.backbone[1](mask).cast(src.dtype) srcs.append(src) masks.append(mask) poss.append(pos_l) - input_query_bbox = input_query_label = attn_mask = dn_meta = None + # input_query_bbox = input_query_label = attn_mask = dn_meta = None + input_query_bbox = input_query_label = attn_mask = None hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( - srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, - text_dict) + srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict + ) # deformable-detr-like anchor update outputs_coord_list = [] - for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs - ) in enumerate(zip(reference[:-1], self.bbox_embed, hs)): + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): layer_delta_unsig = layer_bbox_embed(layer_hs) - layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid( - layer_ref_sig) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(layer_ref_sig) layer_outputs_unsig = F.sigmoid(layer_outputs_unsig) outputs_coord_list.append(layer_outputs_unsig) outputs_coord_list = paddle.stack(outputs_coord_list) # output - outputs_class = paddle.stack([ - layer_cls_embed(layer_hs, text_dict) - for layer_cls_embed, layer_hs in zip(self.class_embed, hs) - ]) - - out = { - "pred_logits": outputs_class[-1], - "pred_boxes": outputs_coord_list[-1] - } + outputs_class = paddle.stack( + [layer_cls_embed(layer_hs, text_dict) for layer_cls_embed, layer_hs in zip(self.class_embed, hs)] + ) + + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]} return out diff --git a/paddlemix/models/groundingdino/ms_deform_attn.py b/paddlemix/models/groundingdino/ms_deform_attn.py index 747a7543977fa..5d29c9ea3e20f 100644 --- a/paddlemix/models/groundingdino/ms_deform_attn.py +++ b/paddlemix/models/groundingdino/ms_deform_attn.py @@ -24,17 +24,17 @@ # helpers def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): - raise ValueError("invalid input for _is_power_of_2: {} (type: {})". 
- format(n, type(n))) + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n - 1) == 0) and n != 0 def deformable_attention_core_func( - value, - value_spatial_shapes, - value_level_start_index, - sampling_locations, - attention_weights, ): + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, +): """ Args: value (Tensor): [bs, value_length, n_head, c] @@ -49,44 +49,46 @@ def deformable_attention_core_func( bs, _, n_head, c = value.shape _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape - value_list = value.split( - value_spatial_shapes.prod(1).split(n_levels), axis=1) + value_list = value.split(value_spatial_shapes.prod(1).split(n_levels), axis=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (h, w) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ - value_l_ = (value_list[level].flatten(2).transpose([0, 2, 1]) - .reshape([bs * n_head, c, h, w])) + value_l_ = value_list[level].flatten(2).transpose([0, 2, 1]).reshape([bs * n_head, c, h, w]) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 - sampling_grid_l_ = (sampling_grids[:, :, :, level].transpose( - [0, 2, 1, 3, 4]).flatten(0, 1)) + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose([0, 2, 1, 3, 4]).flatten(0, 1) # N_*M_, D_, Lq_, P_ sampling_value_l_ = F.grid_sample( value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", - align_corners=False, ) + align_corners=False, + ) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( - [bs * n_head, 1, Len_q, n_levels * n_points]) - output = ((paddle.stack( - sampling_value_list, axis=-2).flatten(-2) * attention_weights).sum(-1) - .reshape([bs, n_head * c, Len_q])) + [bs * n_head, 1, Len_q, n_levels * n_points] + ) + output = ( + (paddle.stack(sampling_value_list, axis=-2).flatten(-2) * attention_weights) + .sum(-1) + .reshape([bs, n_head * c, Len_q]) + ) return output.transpose([0, 2, 1]) class MSDeformableAttention(nn.Layer): def __init__( - self, - embed_dim=256, - num_heads=8, - num_levels=4, - num_points=4, - lr_mult=0.1, - batch_first=False, ): + self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + lr_mult=0.1, + batch_first=False, + ): """ Multi-Scale Deformable Attention Module """ @@ -98,14 +100,14 @@ def __init__( self.total_points = num_heads * num_levels * num_points self.head_dim = embed_dim // num_heads - assert (self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.sampling_offsets = nn.Linear( embed_dim, self.total_points * 2, weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult), ) + bias_attr=ParamAttr(learning_rate=lr_mult), + ) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) @@ -124,16 +126,11 @@ def __init__( def _reset_parameters(self): # sampling_offsets constant_(self.sampling_offsets.weight) - thetas = paddle.arange( - self.num_heads, - dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) + thetas = paddle.arange(self.num_heads, dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) grid_init = paddle.stack([thetas.cos(), 
thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) - grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( - [1, self.num_levels, self.num_points, 1]) - scaling = paddle.arange( - 1, self.num_points + 1, - dtype=paddle.float32).reshape([1, 1, -1, 1]) + grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile([1, self.num_levels, self.num_points, 1]) + scaling = paddle.arange(1, self.num_points + 1, dtype=paddle.float32).reshape([1, 1, -1, 1]) grid_init *= scaling self.sampling_offsets.bias.set_value(grid_init.flatten()) # attention_weights @@ -146,13 +143,14 @@ def _reset_parameters(self): constant_(self.output_proj.bias) def forward( - self, - query, - reference_points, - value, - value_spatial_shapes, - value_level_start_index, - value_mask=None, ): + self, + query, + reference_points, + value, + value_spatial_shapes, + value_level_start_index, + value_mask=None, + ): """ Args: query (Tensor): [bs, query_length, C] @@ -182,34 +180,37 @@ def forward( value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) + [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2] + ) attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) + [bs, Len_q, self.num_heads, self.num_levels * self.num_points] + ) attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) + [bs, Len_q, self.num_heads, self.num_levels, self.num_points] + ) if reference_points.shape[-1] == 2: - offset_normalizer = value_spatial_shapes.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) + offset_normalizer = value_spatial_shapes.flip([1]).reshape([1, 1, 1, self.num_levels, 1, 2]) sampling_locations = ( - reference_points.reshape([bs, Len_q, 1, self.num_levels, 1, 2]) - + sampling_offsets / offset_normalizer) + reference_points.reshape([bs, Len_q, 1, self.num_levels, 1, 2]) + sampling_offsets / offset_normalizer + ) elif reference_points.shape[-1] == 4: sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) else: raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". 
- format(reference_points.shape[-1])) + "Last dim of reference_points must be 2 or 4, but get {} instead.".format(reference_points.shape[-1]) + ) output = self.ms_deformable_attn_core( value, value_spatial_shapes.astype("int64"), value_level_start_index.astype("int64"), sampling_locations, - attention_weights, ) + attention_weights, + ) output = self.output_proj(output) if not self.batch_first: diff --git a/paddlemix/models/groundingdino/transformer.py b/paddlemix/models/groundingdino/transformer.py index dca5d32a59662..3b80821551e13 100644 --- a/paddlemix/models/groundingdino/transformer.py +++ b/paddlemix/models/groundingdino/transformer.py @@ -14,56 +14,62 @@ from typing import Optional -import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from paddle.distributed.fleet.utils import recompute -from paddlenlp.utils.initializer import constant_, normal_, xavier_uniform_ +from paddlenlp.utils.initializer import normal_, xavier_uniform_ from .fuse_modules import BiAttentionBlock from .layers import MultiHeadAttention from .ms_deform_attn import MSDeformableAttention as MSDeformAttn from .transformer_vanilla import TransformerEncoderLayer -from .utils import (MLP, _get_activation_fn, _get_clones, - gen_encoder_output_proposals, gen_sineembed_for_position, - get_sine_pos_embed, inverse_sigmoid) +from .utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, + inverse_sigmoid, +) class Transformer(nn.Layer): def __init__( - self, - d_model=256, - nhead=8, - num_queries=300, - num_encoder_layers=6, - num_unicoder_layers=0, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.0, - activation="relu", - normalize_before=False, - return_intermediate_dec=False, - query_dim=4, - num_patterns=0, - # for deformable encoder - num_feature_levels=1, - enc_n_points=4, - dec_n_points=4, - # init query - learnable_tgt_init=False, - # two stage - two_stage_type="no", - embed_init_tgt=False, - # for text - use_text_enhancer=False, - use_fusion_layer=False, - use_checkpoint=False, - use_transformer_ckpt=False, - use_text_cross_attention=False, - text_dropout=0.1, - fusion_dropout=0.1, - fusion_droppath=0.0, ): + self, + d_model=256, + nhead=8, + num_queries=300, + num_encoder_layers=6, + num_unicoder_layers=0, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.0, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + query_dim=4, + num_patterns=0, + # for deformable encoder + num_feature_levels=1, + enc_n_points=4, + dec_n_points=4, + # init query + learnable_tgt_init=False, + # two stage + two_stage_type="no", + embed_init_tgt=False, + # for text + use_text_enhancer=False, + use_fusion_layer=False, + use_checkpoint=False, + use_transformer_ckpt=False, + use_text_cross_attention=False, + text_dropout=0.1, + fusion_dropout=0.1, + fusion_droppath=0.0, + ): super().__init__() self.num_feature_levels = num_feature_levels self.num_encoder_layers = num_encoder_layers @@ -80,14 +86,16 @@ def __init__( activation, num_feature_levels, nhead, - enc_n_points, ) + enc_n_points, + ) if use_text_enhancer: text_enhance_layer = TransformerEncoderLayer( d_model=d_model, nhead=nhead // 2, dim_feedforward=dim_feedforward // 2, - dropout=text_dropout, ) + dropout=text_dropout, + ) else: text_enhance_layer = None @@ -98,7 +106,8 @@ def __init__( embed_dim=dim_feedforward // 2, num_heads=nhead // 2, dropout=fusion_dropout, - drop_path=fusion_droppath, ) + 
drop_path=fusion_droppath, + ) else: feature_fusion_layer = None @@ -112,7 +121,8 @@ def __init__( text_enhance_layer=text_enhance_layer, feature_fusion_layer=feature_fusion_layer, use_checkpoint=use_checkpoint, - use_transformer_ckpt=use_transformer_ckpt, ) + use_transformer_ckpt=use_transformer_ckpt, + ) # choose decoder layer type decoder_layer = DeformableTransformerDecoderLayer( @@ -123,7 +133,8 @@ def __init__( num_feature_levels, nhead, dec_n_points, - use_text_cross_attention=use_text_cross_attention, ) + use_text_cross_attention=use_text_cross_attention, + ) decoder_norm = nn.LayerNorm(d_model) self.decoder = TransformerDecoder( @@ -133,7 +144,8 @@ def __init__( return_intermediate=return_intermediate_dec, d_model=d_model, query_dim=query_dim, - num_feature_levels=num_feature_levels, ) + num_feature_levels=num_feature_levels, + ) self.d_model = d_model self.nhead = nhead @@ -141,22 +153,19 @@ def __init__( self.num_queries = num_queries # useful for single stage model only self.num_patterns = num_patterns if not isinstance(num_patterns, int): - Warning("num_patterns should be int but {}".format( - type(num_patterns))) + Warning("num_patterns should be int but {}".format(type(num_patterns))) self.num_patterns = 0 if num_feature_levels > 1: if self.num_encoder_layers > 0: - self.level_embed = self.create_parameter( - shape=[num_feature_levels, d_model]) + self.level_embed = self.create_parameter(shape=[num_feature_levels, d_model]) else: self.level_embed = None self.learnable_tgt_init = learnable_tgt_init assert learnable_tgt_init, "why not learnable_tgt_init" self.embed_init_tgt = embed_init_tgt - if (two_stage_type != "no" and embed_init_tgt) or ( - two_stage_type == "no"): + if (two_stage_type != "no" and embed_init_tgt) or (two_stage_type == "no"): self.tgt_embed = nn.Embedding(self.num_queries, d_model) normal_(self.tgt_embed.weight) else: @@ -205,14 +214,15 @@ def init_ref_points(self, use_num_queries): self.refpoint_embed = nn.Embedding(use_num_queries, 4) def forward( - self, - srcs, - masks, - refpoint_embed, - pos_embeds, - tgt, - attn_mask=None, - text_dict=None, ): + self, + srcs, + masks, + refpoint_embed, + pos_embeds, + tgt, + attn_mask=None, + text_dict=None, + ): """ Input: - srcs: List of multi features [bs, ci, hi, wi] @@ -227,18 +237,15 @@ def forward( mask_flatten = [] lvl_pos_embed_flatten = [] spatial_shapes = [] - for lvl, (src, mask, - pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): bs, c, h, w = src.shape spatial_shapes.append(paddle.to_tensor([h, w])) src = src.flatten(2).transpose([0, 2, 1]) # bs, hw, c - mask = mask.cast(paddle.float32).flatten(1).cast( - paddle.bool) # bs, hw + mask = mask.cast(paddle.float32).flatten(1).cast(paddle.bool) # bs, hw pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) # bs, hw, c if self.num_feature_levels > 1 and self.level_embed is not None: - lvl_pos_embed = pos_embed + self.level_embed[lvl].reshape( - [1, 1, -1]) + lvl_pos_embed = pos_embed + self.level_embed[lvl].reshape([1, 1, -1]) else: lvl_pos_embed = pos_embed lvl_pos_embed_flatten.append(lvl_pos_embed) @@ -246,20 +253,20 @@ def forward( mask_flatten.append(mask) src_flatten = paddle.concat(src_flatten, 1) # bs, \sum{hxw}, c mask_flatten = paddle.concat(mask_flatten, 1) # bs, \sum{hxw} - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, - 1) # bs, \sum{hxw}, c + lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c - spatial_shapes = 
paddle.to_tensor( - paddle.stack(spatial_shapes), dtype=paddle.int32) + spatial_shapes = paddle.to_tensor(paddle.stack(spatial_shapes), dtype=paddle.int32) - level_start_index = paddle.concat(( - paddle.zeros( - [1], dtype=spatial_shapes.dtype), - spatial_shapes.prod(1).cumsum(0)[:-1], )) + level_start_index = paddle.concat( + ( + paddle.zeros([1], dtype=spatial_shapes.dtype), + spatial_shapes.prod(1).cumsum(0)[:-1], + ) + ) valid_ratios = paddle.stack([self.get_valid_ratio(m) for m in masks], 1) # two stage - enc_topk_proposals = enc_refpoint_embed = None + # enc_topk_proposals = enc_refpoint_embed = None ######################################################### # Begin Encoder @@ -275,7 +282,8 @@ def forward( text_attention_mask=~text_dict["text_token_mask"], # we ~ the mask . False means use the token; True means pad the token position_ids=text_dict["position_ids"], - text_self_attention_masks=text_dict["text_self_attention_masks"], ) + text_self_attention_masks=text_dict["text_self_attention_masks"], + ) ######################################################### # End Encoder # - memory: bs, \sum{hw}, c @@ -287,16 +295,13 @@ def forward( text_dict["encoded_text"] = memory_text if self.two_stage_type == "standard": - output_memory, output_proposals = gen_encoder_output_proposals( - memory, mask_flatten, spatial_shapes) + output_memory, output_proposals = gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes) output_memory = self.enc_output_norm(self.enc_output(output_memory)) if text_dict is not None: - enc_outputs_class_unselected = self.enc_out_class_embed( - output_memory, text_dict) + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict) else: - enc_outputs_class_unselected = self.enc_out_class_embed( - output_memory) + enc_outputs_class_unselected = self.enc_out_class_embed(output_memory) topk_logits = enc_outputs_class_unselected.max(-1) enc_outputs_coord_unselected = ( @@ -306,47 +311,39 @@ def forward( topk_proposals = paddle.topk(topk_logits, topk, axis=1)[1] # bs, nq - topk_ind = topk_proposals.unsqueeze(axis=-1).tile( - repeat_times=[1, 1, 4]) + topk_ind = topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, 4]) # gather boxes refpoint_embed_undetach = paddle.take_along_axis( - arr=enc_outputs_coord_unselected, axis=1, indices=topk_ind) + arr=enc_outputs_coord_unselected, axis=1, indices=topk_ind + ) refpoint_embed_ = refpoint_embed_undetach - init_box_proposal = F.sigmoid( - paddle.take_along_axis( - arr=output_proposals, axis=1, indices=topk_ind)) + init_box_proposal = F.sigmoid(paddle.take_along_axis(arr=output_proposals, axis=1, indices=topk_ind)) tgt_undetach = paddle.take_along_axis( arr=output_memory, axis=1, - indices=topk_proposals.unsqueeze(axis=-1).tile( - repeat_times=[1, 1, self.d_model]), ) + indices=topk_proposals.unsqueeze(axis=-1).tile(repeat_times=[1, 1, self.d_model]), + ) if self.embed_init_tgt: - tgt_ = (self.tgt_embed.weight[:, None, :].tile([1, bs, 1]) - .transpose([1, 0, 2])) # nq, bs, d_model + tgt_ = self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) # nq, bs, d_model else: tgt_ = tgt_undetach if refpoint_embed is not None: - refpoint_embed = paddle.concat( - [refpoint_embed, refpoint_embed_], axis=1) + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) tgt = paddle.concat([tgt, tgt_], axis=1) else: refpoint_embed, tgt = refpoint_embed_, tgt_ elif self.two_stage_type == "no": - tgt_ = (self.tgt_embed.weight[:, None, :].tile( - [1, bs, 1]).transpose([1, 0, 2])) # 
nq, bs, d_model - refpoint_embed_ = (self.refpoint_embed.weight[:, None, :] - .tile([1, bs, 1]) - .transpose([1, 0, 2])) # nq, bs, 4 + tgt_ = self.tgt_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) # nq, bs, d_model + refpoint_embed_ = self.refpoint_embed.weight[:, None, :].tile([1, bs, 1]).transpose([1, 0, 2]) # nq, bs, 4 if refpoint_embed is not None: - refpoint_embed = paddle.concat( - [refpoint_embed, refpoint_embed_], axis=1) + refpoint_embed = paddle.concat([refpoint_embed, refpoint_embed_], axis=1) tgt = paddle.concat([tgt, tgt_], axis=1) else: refpoint_embed, tgt = refpoint_embed_, tgt_ @@ -355,14 +352,14 @@ def forward( tgt_embed = tgt.tile([1, self.num_patterns, 1]) refpoint_embed = refpoint_embed.tile([1, self.num_patterns, 1]) tgt_pat = self.patterns.weight[None, :, :].repeat_interleave( - self.num_queries, 1) # 1, n_q*n_pat, d_model + self.num_queries, 1 + ) # 1, n_q*n_pat, d_model tgt = tgt_embed + tgt_pat init_box_proposal = F.sigmoid(refpoint_embed_) else: - raise NotImplementedError("unknown two_stage_type {}".format( - self.two_stage_type)) + raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type)) ######################################################### # End preparing tgt # - tgt: bs, NQ, d_model @@ -416,16 +413,17 @@ def forward( class TransformerEncoder(nn.Layer): def __init__( - self, - encoder_layer, - num_layers, - d_model=256, - num_queries=300, - enc_layer_share=False, - text_enhance_layer=None, - feature_fusion_layer=None, - use_checkpoint=False, - use_transformer_ckpt=False, ): + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): """_summary_ Args: @@ -443,17 +441,12 @@ def __init__( self.text_layers = [] self.fusion_layers = [] if num_layers > 0: - self.layers = _get_clones( - encoder_layer, num_layers, layer_share=enc_layer_share) + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) if text_enhance_layer is not None: - self.text_layers = _get_clones( - text_enhance_layer, num_layers, layer_share=enc_layer_share) + self.text_layers = _get_clones(text_enhance_layer, num_layers, layer_share=enc_layer_share) if feature_fusion_layer is not None: - self.fusion_layers = _get_clones( - feature_fusion_layer, - num_layers, - layer_share=enc_layer_share) + self.fusion_layers = _get_clones(feature_fusion_layer, num_layers, layer_share=enc_layer_share) else: self.layers = [] del encoder_layer @@ -479,14 +472,11 @@ def get_reference_points(spatial_shapes, valid_ratios): for lvl, (H_, W_) in enumerate(spatial_shapes): ref_y, ref_x = paddle.meshgrid( - paddle.linspace( - 0.5, H_ - 0.5, H_, dtype=paddle.float32), - paddle.linspace( - 0.5, W_ - 0.5, W_, dtype=paddle.float32), ) - ref_y = ref_y.reshape([-1])[None] / (valid_ratios[:, None, lvl, 1] * - H_) - ref_x = ref_x.reshape([-1])[None] / (valid_ratios[:, None, lvl, 0] * - W_) + paddle.linspace(0.5, H_ - 0.5, H_, dtype=paddle.float32), + paddle.linspace(0.5, W_ - 0.5, W_, dtype=paddle.float32), + ) + ref_y = ref_y.reshape([-1])[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape([-1])[None] / (valid_ratios[:, None, lvl, 0] * W_) ref = paddle.stack((ref_x, ref_y), -1) reference_points_list.append(ref) reference_points = paddle.concat(reference_points_list, 1) @@ -494,20 +484,21 @@ def get_reference_points(spatial_shapes, valid_ratios): return reference_points def forward( - self, 
- # for images - src: paddle.Tensor, - pos: paddle.Tensor, - spatial_shapes: paddle.Tensor, - level_start_index: paddle.Tensor, - valid_ratios: paddle.Tensor, - key_padding_mask: paddle.Tensor, - # for texts - memory_text: paddle.Tensor=None, - text_attention_mask: paddle.Tensor=None, - pos_text: paddle.Tensor=None, - text_self_attention_masks: paddle.Tensor=None, - position_ids: paddle.Tensor=None, ): + self, + # for images + src: paddle.Tensor, + pos: paddle.Tensor, + spatial_shapes: paddle.Tensor, + level_start_index: paddle.Tensor, + valid_ratios: paddle.Tensor, + key_padding_mask: paddle.Tensor, + # for texts + memory_text: paddle.Tensor = None, + text_attention_mask: paddle.Tensor = None, + pos_text: paddle.Tensor = None, + text_self_attention_masks: paddle.Tensor = None, + position_ids: paddle.Tensor = None, + ): """ Input: - src: [bs, sum(hi*wi), 256] @@ -533,22 +524,16 @@ def forward( # preparation and reshape if self.num_layers > 0: - reference_points = self.get_reference_points(spatial_shapes, - valid_ratios) + reference_points = self.get_reference_points(spatial_shapes, valid_ratios) if self.text_layers: # generate pos_text bs, n_text, text_dim = memory_text.shape if pos_text is None and position_ids is None: - pos_text = (paddle.arange(n_text).cast(paddle.float32) - .unsqueeze(0).unsqueeze(-1).tile([bs, 1, 1])) - pos_text = get_sine_pos_embed( - pos_text, num_pos_feats=256, exchange_xy=False) + pos_text = paddle.arange(n_text).cast(paddle.float32).unsqueeze(0).unsqueeze(-1).tile([bs, 1, 1]) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) if position_ids is not None: - pos_text = get_sine_pos_embed( - position_ids[..., None], - num_pos_feats=256, - exchange_xy=False) + pos_text = get_sine_pos_embed(position_ids[..., None], num_pos_feats=256, exchange_xy=False) # main process for layer_id, layer in enumerate(self.layers): @@ -560,20 +545,23 @@ def forward( memory_text, key_padding_mask, text_attention_mask, - **{"preserve_rng_state": True}, ) + **{"preserve_rng_state": True}, + ) else: output, memory_text = self.fusion_layers[layer_id]( v=output, l=memory_text, attention_mask_v=key_padding_mask, - attention_mask_l=text_attention_mask, ) + attention_mask_l=text_attention_mask, + ) if self.text_layers: memory_text = self.text_layers[layer_id]( src=memory_text, src_mask=text_self_attention_masks, # note we use ~ for mask here src_key_padding_mask=text_attention_mask, - pos=(pos_text if pos_text is not None else None), ) + pos=(pos_text if pos_text is not None else None), + ) # main process if self.use_transformer_ckpt: @@ -585,7 +573,8 @@ def forward( spatial_shapes, level_start_index, key_padding_mask, - **{"preserve_rng_state": True}, ) + **{"preserve_rng_state": True}, + ) else: output = layer( src=output, @@ -593,21 +582,23 @@ def forward( reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, - key_padding_mask=key_padding_mask, ) + key_padding_mask=key_padding_mask, + ) return output, memory_text class TransformerDecoder(nn.Layer): def __init__( - self, - decoder_layer, - num_layers, - norm=None, - return_intermediate=False, - d_model=256, - query_dim=4, - num_feature_levels=1, ): + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): super().__init__() if num_layers > 0: self.layers = _get_clones(decoder_layer, num_layers) @@ -618,8 +609,7 @@ def __init__( self.return_intermediate = return_intermediate assert 
return_intermediate, "support return_intermediate only" self.query_dim = query_dim - assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format( - query_dim) + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) self.num_feature_levels = num_feature_levels self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) @@ -634,23 +624,23 @@ def __init__( self.ref_anchor_head = None def forward( - self, - tgt, - memory, - tgt_mask: Optional[paddle.Tensor]=None, - memory_mask: Optional[paddle.Tensor]=None, - tgt_key_padding_mask: Optional[paddle.Tensor]=None, - memory_key_padding_mask: Optional[paddle.Tensor]=None, - pos: Optional[paddle.Tensor]=None, - refpoints_unsigmoid: Optional[ - paddle.Tensor]=None, # num_queries, bs, 2 - # for memory - level_start_index: Optional[paddle.Tensor]=None, # num_levels - spatial_shapes: Optional[paddle.Tensor]=None, # bs, num_levels, 2 - valid_ratios: Optional[paddle.Tensor]=None, - # for text - memory_text: Optional[paddle.Tensor]=None, - text_attention_mask: Optional[paddle.Tensor]=None, ): + self, + tgt, + memory, + tgt_mask: Optional[paddle.Tensor] = None, + memory_mask: Optional[paddle.Tensor] = None, + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + memory_key_padding_mask: Optional[paddle.Tensor] = None, + pos: Optional[paddle.Tensor] = None, + refpoints_unsigmoid: Optional[paddle.Tensor] = None, # num_queries, bs, 2 + # for memory + level_start_index: Optional[paddle.Tensor] = None, # num_levels + spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[paddle.Tensor] = None, + # for text + memory_text: Optional[paddle.Tensor] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): """ Input: - tgt: nq, bs, d_model @@ -669,20 +659,16 @@ def forward( if reference_points.shape[-1] == 4: reference_points_input = ( - reference_points[:, :, None] * - paddle.concat([valid_ratios, valid_ratios], -1)[None, :] + reference_points[:, :, None] * paddle.concat([valid_ratios, valid_ratios], -1)[None, :] ) # nq, bs, nlevel, 4 else: assert reference_points.shape[-1] == 2 - reference_points_input = (reference_points[:, :, None] * - valid_ratios[None, :]) - query_sine_embed = gen_sineembed_for_position( - reference_points_input[:, :, 0, :]) # nq, bs, 256*2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position(reference_points_input[:, :, 0, :]) # nq, bs, 256*2 # conditional query raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 - pos_scale = self.query_scale( - output) if self.query_scale is not None else 1 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 query_pos = pos_scale * raw_query_pos # main process @@ -700,10 +686,10 @@ def forward( memory_spatial_shapes=spatial_shapes, memory_pos=pos, self_attn_mask=tgt_mask, - cross_attn_mask=memory_mask, ) + cross_attn_mask=memory_mask, + ) - if (output.isnan().any() | - output.isinf().any()) and paddle.in_dynamic_mode(): + if (output.isnan().any() | output.isinf().any()) and paddle.in_dynamic_mode(): print(f"output layer_id {layer_id} is nan") try: num_nan = output.isnan().sum().item() @@ -734,14 +720,15 @@ def forward( class DeformableTransformerEncoderLayer(nn.Layer): def __init__( - self, - d_model=256, - d_ffn=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_heads=8, - n_points=4, ): + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + 
n_points=4, + ): super().__init__() # self attention @@ -750,7 +737,8 @@ def __init__( num_levels=n_levels, num_heads=n_heads, num_points=n_points, - batch_first=True, ) + batch_first=True, + ) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) @@ -773,13 +761,14 @@ def forward_ffn(self, src): return src def forward( - self, - src, - pos, - reference_points, - spatial_shapes, - level_start_index, - key_padding_mask=None, ): + self, + src, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask=None, + ): src2 = self.self_attn( query=self.with_pos_embed(src, pos), @@ -787,7 +776,8 @@ def forward( value=src, value_spatial_shapes=spatial_shapes, value_level_start_index=level_start_index, - value_mask=key_padding_mask, ) + value_mask=key_padding_mask, + ) src = src + self.dropout1(src2) src = self.norm1(src) @@ -799,16 +789,17 @@ def forward( class DeformableTransformerDecoderLayer(nn.Layer): def __init__( - self, - d_model=256, - d_ffn=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_heads=8, - n_points=4, - use_text_feat_guide=False, - use_text_cross_attention=False, ): + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): super().__init__() # cross attention @@ -817,15 +808,15 @@ def __init__( num_levels=n_levels, num_heads=n_heads, num_points=n_points, - batch_first=True, ) + batch_first=True, + ) self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() self.norm1 = nn.LayerNorm(d_model) # cross attention text if use_text_cross_attention: self.ca_text = MultiHeadAttention(d_model, n_heads, dropout=dropout) - self.catext_dropout = nn.Dropout( - dropout) if dropout > 0 else nn.Identity() + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() self.catext_norm = nn.LayerNorm(d_model) # self attention @@ -835,8 +826,7 @@ def __init__( # ffn self.linear1 = nn.Linear(d_model, d_ffn) - self.activation = _get_activation_fn( - activation, d_model=d_ffn, batch_dim=1) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() self.linear2 = nn.Linear(d_ffn, d_model) self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() @@ -864,30 +854,24 @@ def forward_ffn(self, tgt): return tgt def forward( - self, - # for tgt - tgt: Optional[paddle.Tensor], # nq, bs, d_model - tgt_query_pos: Optional[ - paddle.Tensor]=None, # pos for query. MLP(Sine(pos)) - tgt_query_sine_embed: Optional[ - paddle.Tensor]=None, # pos for query. 
Sine(pos) - tgt_key_padding_mask: Optional[paddle.Tensor]=None, - tgt_reference_points: Optional[paddle.Tensor]=None, # nq, bs, 4 - memory_text: Optional[paddle.Tensor]=None, # bs, num_token, d_model - text_attention_mask: Optional[paddle.Tensor]=None, # bs, num_token - # for memory - memory: Optional[paddle.Tensor]=None, # hw, bs, d_model - memory_key_padding_mask: Optional[paddle.Tensor]=None, - memory_level_start_index: Optional[ - paddle.Tensor]=None, # num_levels - memory_spatial_shapes: Optional[ - paddle.Tensor]=None, # bs, num_levels, 2 - memory_pos: Optional[paddle.Tensor]=None, # pos for memory - # sa - self_attn_mask: Optional[ - paddle.Tensor]=None, # mask used for self-attention - cross_attn_mask: Optional[ - paddle.Tensor]=None, # mask used for cross-attention + self, + # for tgt + tgt: Optional[paddle.Tensor], # nq, bs, d_model + tgt_query_pos: Optional[paddle.Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[paddle.Tensor] = None, # pos for query. Sine(pos) + tgt_key_padding_mask: Optional[paddle.Tensor] = None, + tgt_reference_points: Optional[paddle.Tensor] = None, # nq, bs, 4 + memory_text: Optional[paddle.Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[paddle.Tensor] = None, # bs, num_token + # for memory + memory: Optional[paddle.Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[paddle.Tensor] = None, + memory_level_start_index: Optional[paddle.Tensor] = None, # num_levels + memory_spatial_shapes: Optional[paddle.Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[paddle.Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[paddle.Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[paddle.Tensor] = None, # mask used for cross-attention ): """ Input: @@ -904,8 +888,8 @@ def forward( q, k, tgt, - attn_mask=self_attn_mask - if self_attn_mask is None else ~self_attn_mask, )[0] + attn_mask=self_attn_mask if self_attn_mask is None else ~self_attn_mask, + )[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) @@ -914,7 +898,8 @@ def forward( self.with_pos_embed(tgt, tgt_query_pos), memory_text, memory_text, - attn_mask=~text_attention_mask, )[0] + attn_mask=~text_attention_mask, + )[0] tgt = tgt + self.catext_dropout(tgt2) tgt = self.catext_norm(tgt) @@ -924,7 +909,8 @@ def forward( value=memory, value_spatial_shapes=memory_spatial_shapes, value_level_start_index=memory_level_start_index, - value_mask=memory_key_padding_mask, ) + value_mask=memory_key_padding_mask, + ) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) @@ -962,4 +948,5 @@ def build_transformer(args): use_text_cross_attention=args.use_text_cross_attention, text_dropout=args.text_dropout, fusion_dropout=args.fusion_dropout, - fusion_droppath=args.fusion_droppath, ) + fusion_droppath=args.fusion_droppath, + ) diff --git a/paddlemix/models/groundingdino/transformer_vanilla.py b/paddlemix/models/groundingdino/transformer_vanilla.py index 858dc6cd87395..7dacba17b74a7 100644 --- a/paddlemix/models/groundingdino/transformer_vanilla.py +++ b/paddlemix/models/groundingdino/transformer_vanilla.py @@ -15,22 +15,14 @@ from typing import Optional import paddle -import paddle.nn.functional as F from paddle import Tensor, nn from .layers import MultiHeadAttention -from .utils import (MLP, _get_activation_fn, _get_clones, - gen_encoder_output_proposals, gen_sineembed_for_position, - sigmoid_focal_loss) +from .utils import _get_activation_fn, _get_clones class TextTransformer(nn.Layer): - def 
__init__(self, - num_layers, - d_model=256, - nheads=8, - dim_feedforward=2048, - dropout=0.1): + def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1): super().__init__() self.num_layers = num_layers self.d_model = d_model @@ -42,12 +34,11 @@ def __init__(self, d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, - dropout=dropout, ) + dropout=dropout, + ) self.layers = _get_clones(single_encoder_layer, num_layers) - def forward(self, - memory_text: paddle.Tensor, - text_attention_mask: paddle.Tensor): + def forward(self, memory_text: paddle.Tensor, text_attention_mask: paddle.Tensor): """ Args: @@ -74,13 +65,14 @@ def forward(self, class TransformerEncoderLayer(nn.Layer): def __init__( - self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, ): + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): super().__init__() self.self_attn = MultiHeadAttention(d_model, nhead, dropout=dropout) # Implementation of Feedforward model @@ -101,11 +93,12 @@ def with_pos_embed(self, tensor, pos: Optional[Tensor]): return tensor if pos is None else tensor + pos def forward( - self, - src, - src_mask: Optional[Tensor]=None, - src_key_padding_mask: Optional[Tensor]=None, - pos: Optional[Tensor]=None, ): + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): # repeat attn mask if src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]: # bs, num_q, num_k diff --git a/paddlemix/models/groundingdino/utils.py b/paddlemix/models/groundingdino/utils.py index c2c4115e02561..42984c8ea91c0 100644 --- a/paddlemix/models/groundingdino/utils.py +++ b/paddlemix/models/groundingdino/utils.py @@ -41,10 +41,11 @@ def _get_clones(module, N, layer_share=False): def get_sine_pos_embed( - pos_tensor: paddle.Tensor, - num_pos_feats: int=128, - temperature: int=10000, - exchange_xy: bool=True, ): + pos_tensor: paddle.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): """generate sine position embedding from a position tensor Args: pos_tensor (paddle.Tensor): shape: [..., n]. 
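[Reviewer note, not part of the diff] The hunks around this point only reformat get_sine_pos_embed; as a quick reference, the embedding it computes is an interleaved sine/cosine code per coordinate. The stand-alone sketch below reproduces that computation for a single [n, 1] coordinate column; the function name and the plain-Paddle ops are illustrative assumptions, not the repository's API.

    import math
    import paddle

    def sine_embed_1d(coord: paddle.Tensor, num_pos_feats: int = 128, temperature: float = 10000.0) -> paddle.Tensor:
        # coord: [n, 1] normalized position -> [n, num_pos_feats] embedding
        scale = 2 * math.pi
        dim_t = paddle.arange(num_pos_feats, dtype="float32")
        # channel pairs share one frequency: temperature ** (2 * floor(i / 2) / num_pos_feats)
        dim_t = temperature ** (2.0 * paddle.floor(dim_t / 2) / num_pos_feats)
        x = coord * scale / dim_t  # broadcasts to [n, num_pos_feats]
        # even channels take sin, odd channels take cos, then re-interleave
        return paddle.stack((x[:, 0::2].sin(), x[:, 1::2].cos()), axis=2).flatten(1)

Applied once per box coordinate and concatenated, this is the same construction gen_sineembed_for_position and get_sine_pos_embed use for the decoder's query embeddings.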
@@ -57,20 +58,14 @@ def get_sine_pos_embed( """ scale = 2 * math.pi dim_t = paddle.arange(num_pos_feats) - dim_t = temperature**( - 2.0 * paddle.floor_divide(dim_t, paddle.to_tensor(2)) / num_pos_feats) + dim_t = temperature ** (2.0 * paddle.floor_divide(dim_t, paddle.to_tensor(2)) / num_pos_feats) def sine_func(x: paddle.Tensor): sin_x = x * scale / dim_t - sin_x = paddle.stack( - (sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), axis=3).flatten(2) + sin_x = paddle.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), axis=3).flatten(2) return sin_x - pos_res = [ - sine_func(x) - for x in pos_tensor.split( - [1] * pos_tensor.shape[-1], axis=-1) - ] + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], axis=-1)] if exchange_xy: pos_res[0], pos_res[1] = pos_res[1], pos_res[0] pos_res = paddle.concat(pos_res, axis=-1) @@ -78,10 +73,11 @@ def sine_func(x: paddle.Tensor): def gen_encoder_output_proposals( - memory: paddle.Tensor, - memory_padding_mask: paddle.Tensor, - spatial_shapes: paddle.Tensor, - learnedwh=None, ): + memory: paddle.Tensor, + memory_padding_mask: paddle.Tensor, + spatial_shapes: paddle.Tensor, + learnedwh=None, +): """ Input: - memory: bs, \sum{hw}, d_model @@ -96,23 +92,19 @@ def gen_encoder_output_proposals( proposals = [] _cur = 0 for lvl, (H_, W_) in enumerate(spatial_shapes): - mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H_ * W_)].reshape( - [N_, H_, W_, 1]) + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].reshape([N_, H_, W_, 1]) valid_H = paddle.sum(~mask_flatten_[:, :, 0, 0], 1) valid_W = paddle.sum(~mask_flatten_[:, 0, :, 0], 1) # import ipdb; ipdb.set_trace() grid_y, grid_x = paddle.meshgrid( - paddle.linspace( - 0, H_ - 1, H_, dtype=paddle.float32), - paddle.linspace( - 0, W_ - 1, W_, dtype=paddle.float32), ) - grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], - -1) # H_, W_, 2 - - scale = paddle.concat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], - 1).reshape([N_, 1, 1, 2]) + paddle.linspace(0, H_ - 1, H_, dtype=paddle.float32), + paddle.linspace(0, W_ - 1, W_, dtype=paddle.float32), + ) + grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = paddle.concat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).reshape([N_, 1, 1, 2]) grid = (grid.unsqueeze(0).tile([N_, 1, 1, 1]) + 0.5) / scale if learnedwh is not None: @@ -126,33 +118,21 @@ def gen_encoder_output_proposals( _cur += H_ * W_ output_proposals = paddle.concat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & - (output_proposals < 0.99)).all(-1, keepdim=True) - output_proposals = paddle.log(output_proposals / - (1 - output_proposals)) # unsigmoid - output_proposals = masked_fill(output_proposals, - memory_padding_mask.unsqueeze(-1), - float("inf")) - output_proposals = masked_fill(output_proposals, ~output_proposals_valid, - float("inf")) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = paddle.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = masked_fill(output_proposals, memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = masked_fill(output_proposals, ~output_proposals_valid, float("inf")) output_memory = memory - output_memory = masked_fill(output_memory, - memory_padding_mask.unsqueeze(-1), float(0)) - output_memory = masked_fill(output_memory, ~output_proposals_valid, - float(0)) + output_memory = masked_fill(output_memory, 
memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = masked_fill(output_memory, ~output_proposals_valid, float(0)) return output_memory, output_proposals class RandomBoxPerturber: - def __init__(self, - x_noise_scale=0.2, - y_noise_scale=0.2, - w_noise_scale=0.2, - h_noise_scale=0.2) -> None: - self.noise_scale = paddle.to_tensor( - [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]) + def __init__(self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2) -> None: + self.noise_scale = paddle.to_tensor([x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale]) def __call__(self, refanchors: paddle.Tensor) -> paddle.Tensor: nq, bs, query_dim = refanchors.shape @@ -165,12 +145,13 @@ def __call__(self, refanchors: paddle.Tensor) -> paddle.Tensor: def sigmoid_focal_loss( - inputs, - targets, - num_boxes, - alpha: float=0.25, - gamma: float=2, - no_reduction=False, ): + inputs, + targets, + num_boxes, + alpha: float = 0.25, + gamma: float = 2, + no_reduction=False, +): """ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. Args: @@ -187,10 +168,9 @@ def sigmoid_focal_loss( Loss tensor """ prob = inputs.sigmoid() - ce_loss = F.binary_cross_entropy_with_logits( - inputs, targets, reduction="none") + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") p_t = prob * targets + (1 - prob) * (1 - targets) - loss = ce_loss * ((1 - p_t)**gamma) + loss = ce_loss * ((1 - p_t) ** gamma) if alpha >= 0: alpha_t = alpha * targets + (1 - alpha) * (1 - targets) @@ -209,8 +189,7 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) - self.layers = nn.LayerList( - nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.layers = nn.LayerList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): @@ -238,34 +217,27 @@ def gen_sineembed_for_position(pos_tensor): scale = 2 * math.pi dim_t = paddle.arange(128) - dim_t = 10000**(2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) + dim_t = 10000 ** (2 * (paddle.floor_divide(dim_t, paddle.to_tensor(2))) / 128) x_embed = pos_tensor[:, :, 0] * scale y_embed = pos_tensor[:, :, 1] * scale pos_x = x_embed[:, :, None] / dim_t pos_y = y_embed[:, :, None] / dim_t - pos_x = paddle.stack( - (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), axis=3).flatten(2) - pos_y = paddle.stack( - (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), axis=3).flatten(2) + pos_x = paddle.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), axis=3).flatten(2) + pos_y = paddle.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), axis=3).flatten(2) if pos_tensor.shape[-1] == 2: pos = paddle.concat((pos_y, pos_x), aixs=2) elif pos_tensor.shape[-1] == 4: w_embed = pos_tensor[:, :, 2] * scale pos_w = w_embed[:, :, None] / dim_t - pos_w = paddle.stack( - (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), - axis=3).flatten(2) + pos_w = paddle.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), axis=3).flatten(2) h_embed = pos_tensor[:, :, 3] * scale pos_h = h_embed[:, :, None] / dim_t - pos_h = paddle.stack( - (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), - axis=3).flatten(2) + pos_h = paddle.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), axis=3).flatten(2) pos = paddle.concat((pos_y, pos_x, pos_w, pos_h), axis=2) else: - raise ValueError("Unknown pos_tensor 
shape(-1):{}".format( - pos_tensor.shape[-1])) + raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.shape[-1])) return pos @@ -297,12 +269,11 @@ def forward(self, x, text_dict): y = text_dict["encoded_text"] text_token_mask = text_dict["text_token_mask"] - res = x @y.transpose([0, 2, 1]) + res = x @ y.transpose([0, 2, 1]) masked_fill(res, ~text_token_mask[:, None, :], float("-inf")) # padding to max_text_len - new_res = paddle.full((*res.shape[:-1], self.max_text_len), - float("-inf")) - new_res[..., :res.shape[-1]] = res + new_res = paddle.full((*res.shape[:-1], self.max_text_len), float("-inf")) + new_res[..., : res.shape[-1]] = res return new_res diff --git a/paddlemix/models/imagebind/configuration.py b/paddlemix/models/imagebind/configuration.py index 2ce6a32933571..04bd63b46cc2a 100644 --- a/paddlemix/models/imagebind/configuration.py +++ b/paddlemix/models/imagebind/configuration.py @@ -16,8 +16,7 @@ import os from typing import Union -from paddlenlp.transformers.clip.configuration import (CLIPTextConfig, - CLIPVisionConfig) +from paddlenlp.transformers.clip.configuration import CLIPTextConfig, CLIPVisionConfig from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlemix.utils.log import logger @@ -35,28 +34,23 @@ class ImageBindVisionConfig(CLIPVisionConfig): model_type = "imagebind_vision_model" def __init__( - self, - **kwargs, ): - kwargs["return_dict"] = kwargs.pop("return_dict", True) - super().__init__(**kwargs) - - def __init__( - self, - hidden_size=768, - intermediate_size=3072, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=12, - num_channels=3, - image_size=224, - patch_size=32, - hidden_act="quick_gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - **kwargs, ): + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -81,29 +75,31 @@ class ImageBindTextConfig(CLIPTextConfig): model_type = "imagebind_text_model" def __init__( - self, - vocab_size=49408, - hidden_size=512, - intermediate_size=2048, - projection_dim=512, - num_hidden_layers=12, - num_attention_heads=8, - max_position_embeddings=77, - hidden_act="quick_gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=1.0, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - **kwargs, ): + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs, + ): super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, - **kwargs, ) + **kwargs, + ) self.vocab_size = vocab_size self.hidden_size = hidden_size @@ -125,24 +121,21 @@ class ImageBindAudioConfig(PretrainedConfig): model_type = "imagebind_audio_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = 
kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["audio_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -156,24 +149,21 @@ class ImageBindDepthConfig(PretrainedConfig): model_type = "imagebind_depth_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["depth_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -187,24 +177,21 @@ class ImageBindThermalConfig(PretrainedConfig): model_type = "imagebind_thermal_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["thermal_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." @@ -218,24 +205,21 @@ class ImageBindIMUConfig(PretrainedConfig): model_type = "imagebind_imu_model" def __init__( - self, - **kwargs, ): + self, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from Blip2Config if config_dict.get("model_type") == "imagebind": config_dict = config_dict["imu_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -250,16 +234,17 @@ class ImageBindConfig(PretrainedConfig): is_composition = True def __init__( - self, - text_config=None, - vision_config=None, - audio_config=None, - depth_config=None, - thermal_config=None, - imu_config=None, - projection_dim=512, - logit_scale_init_value=2.6592, - **kwargs, ): + self, + text_config=None, + vision_config=None, + audio_config=None, + depth_config=None, + thermal_config=None, + imu_config=None, + projection_dim=512, + logit_scale_init_value=2.6592, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -285,39 +270,27 @@ def __init__( if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the ImageBindTextConfig with default values." - ) + logger.info("text_config is None. Initializing the ImageBindTextConfig with default values.") if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the ImageBindVisionConfig with default values." - ) + logger.info("vision_config is None. initializing the ImageBindVisionConfig with default values.") if audio_config is None: audio_config = {} - logger.info( - "audio_config is None. initializing the ImageBindAudioConfig with default values." - ) + logger.info("audio_config is None. initializing the ImageBindAudioConfig with default values.") if depth_config is None: depth_config = {} - logger.info( - "depth_config is None. initializing the ImageBindDepthConfig with default values." - ) + logger.info("depth_config is None. initializing the ImageBindDepthConfig with default values.") if thermal_config is None: thermal_config = {} - logger.info( - "thermal_config is None. initializing the ImageBindThermalConfig with default values." - ) + logger.info("thermal_config is None. initializing the ImageBindThermalConfig with default values.") if imu_config is None: imu_config = {} - logger.info( - "imu_config is None. initializing the ImageBindIMUConfig with default values." - ) + logger.info("imu_config is None. 
initializing the ImageBindIMUConfig with default values.") # text_config["projection_dim"] = projection_dim # vision_config["projection_dim"] = projection_dim @@ -334,10 +307,11 @@ def __init__( @classmethod def from_text_vision_configs( - cls, - text_config: ImageBindTextConfig, - vision_config: ImageBindVisionConfig, - **kwargs, ): + cls, + text_config: ImageBindTextConfig, + vision_config: ImageBindVisionConfig, + **kwargs, + ): r""" Instantiate a [`ImageBindConfig`] (or a derived class) from clip text model configuration and clip vision model configuration. @@ -349,7 +323,8 @@ def from_text_vision_configs( return cls( text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/imagebind/helpers.py b/paddlemix/models/imagebind/helpers.py index 228a17265b9f6..f6f3a4efef632 100644 --- a/paddlemix/models/imagebind/helpers.py +++ b/paddlemix/models/imagebind/helpers.py @@ -24,7 +24,8 @@ AUDIO="audio", THERMAL="thermal", DEPTH="depth", - IMU="imu", ) + IMU="imu", +) class Normalize(paddle.nn.Layer): @@ -38,10 +39,11 @@ def forward(self, x): class LearnableLogitScaling(paddle.nn.Layer): def __init__( - self, - logit_scale_init: float=1 / 0.07, - learnable: bool=True, - max_logit_scale: float=100, ) -> None: + self, + logit_scale_init: float = 1 / 0.07, + learnable: bool = True, + max_logit_scale: float = 100, + ) -> None: super().__init__() self.max_logit_scale = max_logit_scale self.logit_scale_init = logit_scale_init @@ -49,18 +51,15 @@ def __init__( log_logit_scale = paddle.ones(shape=[]) * np.log(self.logit_scale_init) if learnable: self.log_logit_scale = paddle.create_parameter( - shape=log_logit_scale.shape - if log_logit_scale.dim() != 0 else [1], + shape=log_logit_scale.shape if log_logit_scale.dim() != 0 else [1], dtype=log_logit_scale.dtype, - default_initializer=paddle.nn.initializer.Assign( - value=log_logit_scale), ) + default_initializer=paddle.nn.initializer.Assign(value=log_logit_scale), + ) else: self.register_buffer("log_logit_scale", log_logit_scale) def forward(self, x): - return (paddle.clip( - x=self.log_logit_scale.exp(), - max=self.max_logit_scale).unsqueeze(0) * x) + return paddle.clip(x=self.log_logit_scale.exp(), max=self.max_logit_scale).unsqueeze(0) * x class EinOpsRearrange(paddle.nn.Layer): @@ -81,14 +80,20 @@ class VerboseNNModule(paddle.nn.Layer): @staticmethod def get_readable_tensor_repr(name: str, tensor: paddle.Tensor) -> str: - st = ("(" + name + "): " + "tensor(" + str(tuple(tensor[1].shape)) + - ", requires_grad=" + str(not tensor[1].stop_gradient) + ")\n") + st = ( + "(" + + name + + "): " + + "tensor(" + + str(tuple(tensor[1].shape)) + + ", requires_grad=" + + str(not tensor[1].stop_gradient) + + ")\n" + ) return st -def cast_if_src_dtype(tensor: paddle.Tensor, - src_dtype: paddle.dtype, - tgt_dtype: paddle.dtype): +def cast_if_src_dtype(tensor: paddle.Tensor, src_dtype: paddle.dtype, tgt_dtype: paddle.dtype): updated = False if tensor.dtype == src_dtype: tensor = tensor.cast(tgt_dtype) diff --git a/paddlemix/models/imagebind/modeling.py b/paddlemix/models/imagebind/modeling.py index e3f85a9a2016a..6b7104c8f9ae5 100644 --- a/paddlemix/models/imagebind/modeling.py +++ b/paddlemix/models/imagebind/modeling.py @@ -14,23 +14,27 @@ from functools import partial from types import SimpleNamespace -from typing import Any, Dict, List, Tuple -import numpy as np import paddle -from paddle import nn -from paddle.nn import functional as F -from 
paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from .configuration import ImageBindConfig -from .helpers import (EinOpsRearrange, LearnableLogitScaling, Normalize, - SelectElement, SelectEOSAndProject, VerboseNNModule, - cast_if_src_dtype) +from .helpers import ( + LearnableLogitScaling, + Normalize, + SelectElement, + SelectEOSAndProject, +) from .multimodal_preprocessors import ( - AudioPreprocessor, IMUPreprocessor, PadIm2Video, PatchEmbedGeneric, - RGBDTPreprocessor, SpatioTemporalPosEmbeddingHelper, TextPreprocessor, - ThermalPreprocessor) + AudioPreprocessor, + IMUPreprocessor, + PadIm2Video, + PatchEmbedGeneric, + RGBDTPreprocessor, + SpatioTemporalPosEmbeddingHelper, + TextPreprocessor, + ThermalPreprocessor, +) from .transformer import MultiheadAttention, SimpleTransformer ModalityType = SimpleNamespace( @@ -39,7 +43,8 @@ AUDIO="audio", THERMAL="thermal", DEPTH="depth", - IMU="imu", ) + IMU="imu", +) __all__ = [ "ImageBindModel", @@ -85,8 +90,8 @@ def __init__(self, config: ImageBindConfig): text_embed_dim = config.text_config.text_embed_dim text_num_blocks = config.text_config.text_num_blocks text_num_heads = config.text_config.text_num_heads - context_length = config.text_config.context_length - vocab_size = config.text_config.vocab_size + # context_length = config.text_config.context_length + # vocab_size = config.text_config.vocab_size # depth_config depth_embed_dim = config.depth_config.depth_embed_dim @@ -104,7 +109,7 @@ def __init__(self, config: ImageBindConfig): # imu_config imu_embed_dim = config.imu_config.imu_embed_dim - imu_kernel_size = config.imu_config.imu_kernel_size + # imu_kernel_size = config.imu_config.imu_kernel_size imu_num_blocks = config.imu_config.imu_num_blocks imu_num_heads = config.imu_config.imu_num_heads imu_drop_path = config.imu_config.imu_drop_path @@ -123,7 +128,8 @@ def __init__(self, config: ImageBindConfig): depth_kernel_size, thermal_embed_dim, thermal_kernel_size, - imu_embed_dim, ) + imu_embed_dim, + ) self.modality_trunks = self._create_modality_trunks( vision_embed_dim, vision_num_blocks, @@ -146,7 +152,8 @@ def __init__(self, config: ImageBindConfig): imu_embed_dim, imu_num_blocks, imu_num_heads, - imu_drop_path, ) + imu_drop_path, + ) self.modality_heads = self._create_modality_heads( out_embed_dim, vision_embed_dim, @@ -154,48 +161,52 @@ def __init__(self, config: ImageBindConfig): audio_embed_dim, depth_embed_dim, thermal_embed_dim, - imu_embed_dim, ) - self.modality_postprocessors = self._create_modality_postprocessors( - out_embed_dim) + imu_embed_dim, + ) + self.modality_postprocessors = self._create_modality_postprocessors(out_embed_dim) def _create_modality_preprocessors( - self, - video_frames, - vision_embed_dim, - kernel_size, - text_embed_dim, - audio_embed_dim, - audio_kernel_size, - audio_stride, - audio_num_mel_bins, - audio_target_len, - depth_embed_dim, - depth_kernel_size, - thermal_embed_dim, - thermal_kernel_size, - imu_embed_dim, ): - rgbt_stem = PatchEmbedGeneric(proj_stem=[ - PadIm2Video( - pad_type="repeat", ntimes=2), - paddle.nn.Conv3D( - in_channels=3, - kernel_size=kernel_size, - out_channels=vision_embed_dim, - stride=kernel_size, - bias_attr=False, ), - ]) + self, + video_frames, + vision_embed_dim, + kernel_size, + text_embed_dim, + audio_embed_dim, + audio_kernel_size, + audio_stride, + audio_num_mel_bins, + audio_target_len, + depth_embed_dim, + depth_kernel_size, + thermal_embed_dim, + 
thermal_kernel_size, + imu_embed_dim, + ): + rgbt_stem = PatchEmbedGeneric( + proj_stem=[ + PadIm2Video(pad_type="repeat", ntimes=2), + paddle.nn.Conv3D( + in_channels=3, + kernel_size=kernel_size, + out_channels=vision_embed_dim, + stride=kernel_size, + bias_attr=False, + ), + ] + ) rgbt_preprocessor = RGBDTPreprocessor( img_size=[3, video_frames, 224, 224], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), rgbt_stem=rgbt_stem, - depth_stem=None, ) + depth_stem=None, + ) text_preprocessor = TextPreprocessor( context_length=77, vocab_size=49408, embed_dim=text_embed_dim, - causal_masking=True, ) + causal_masking=True, + ) audio_stem = PatchEmbedGeneric( proj_stem=[ paddle.nn.Conv2D( @@ -203,19 +214,22 @@ def _create_modality_preprocessors( kernel_size=audio_kernel_size, stride=audio_stride, out_channels=audio_embed_dim, - bias_attr=False, ) + bias_attr=False, + ) ], norm_layer=paddle.nn.LayerNorm( normalized_shape=audio_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) audio_preprocessor = AudioPreprocessor( img_size=[1, audio_num_mel_bins, audio_target_len], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), - audio_stem=audio_stem, ) + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), + audio_stem=audio_stem, + ) depth_stem = PatchEmbedGeneric( [ paddle.nn.Conv2D( @@ -223,20 +237,23 @@ def _create_modality_preprocessors( in_channels=1, out_channels=depth_embed_dim, stride=depth_kernel_size, - bias_attr=False, ) + bias_attr=False, + ) ], norm_layer=paddle.nn.LayerNorm( normalized_shape=depth_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) depth_preprocessor = RGBDTPreprocessor( img_size=[1, 224, 224], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), rgbt_stem=None, - depth_stem=depth_stem, ) + depth_stem=depth_stem, + ) thermal_stem = PatchEmbedGeneric( [ paddle.nn.Conv2D( @@ -244,37 +261,39 @@ def _create_modality_preprocessors( in_channels=1, out_channels=thermal_embed_dim, stride=thermal_kernel_size, - bias_attr=False, ) + bias_attr=False, + ) ], norm_layer=paddle.nn.LayerNorm( normalized_shape=thermal_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) thermal_preprocessor = ThermalPreprocessor( img_size=[1, 224, 224], num_cls_tokens=1, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), - thermal_stem=thermal_stem, ) + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), + thermal_stem=thermal_stem, + ) imu_stem = PatchEmbedGeneric( - [ - paddle.nn.Linear( - in_features=48, out_features=imu_embed_dim, bias_attr=False) - ], + [paddle.nn.Linear(in_features=48, out_features=imu_embed_dim, bias_attr=False)], norm_layer=paddle.nn.LayerNorm( normalized_shape=imu_embed_dim, epsilon=1e-05, weight_attr=None, - bias_attr=None, ), ) + bias_attr=None, + ), + ) imu_preprocessor = IMUPreprocessor( img_size=[6, 2000], num_cls_tokens=1, kernel_size=8, embed_dim=imu_embed_dim, - pos_embed_fn=partial( - SpatioTemporalPosEmbeddingHelper, learnable=True), - imu_stem=imu_stem, ) + pos_embed_fn=partial(SpatioTemporalPosEmbeddingHelper, learnable=True), + imu_stem=imu_stem, + ) modality_preprocessors = { ModalityType.VISION: 
rgbt_preprocessor, ModalityType.TEXT: text_preprocessor, @@ -286,31 +305,31 @@ def _create_modality_preprocessors( return paddle.nn.LayerDict(sublayers=modality_preprocessors) def _create_modality_trunks( - self, - vision_embed_dim, - vision_num_blocks, - vision_num_heads, - text_embed_dim, - text_num_blocks, - text_num_heads, - audio_embed_dim, - audio_num_blocks, - audio_num_heads, - audio_drop_path, - depth_embed_dim, - depth_num_blocks, - depth_num_heads, - depth_drop_path, - thermal_embed_dim, - thermal_num_blocks, - thermal_num_heads, - thermal_drop_path, - imu_embed_dim, - imu_num_blocks, - imu_num_heads, - imu_drop_path, ): - def instantiate_trunk(embed_dim, num_blocks, num_heads, - pre_transformer_ln, add_bias_kv, drop_path): + self, + vision_embed_dim, + vision_num_blocks, + vision_num_heads, + text_embed_dim, + text_num_blocks, + text_num_heads, + audio_embed_dim, + audio_num_blocks, + audio_num_heads, + audio_drop_path, + depth_embed_dim, + depth_num_blocks, + depth_num_heads, + depth_drop_path, + thermal_embed_dim, + thermal_num_blocks, + thermal_num_heads, + thermal_drop_path, + imu_embed_dim, + imu_num_blocks, + imu_num_heads, + imu_drop_path, + ): + def instantiate_trunk(embed_dim, num_blocks, num_heads, pre_transformer_ln, add_bias_kv, drop_path): return SimpleTransformer( embed_dim=embed_dim, num_blocks=num_blocks, @@ -321,14 +340,17 @@ def instantiate_trunk(embed_dim, num_blocks, num_heads, add_bias_kv=add_bias_kv, embed_dim=embed_dim, num_heads=num_heads, - bias_attr=True, ), + bias_attr=True, + ), pre_transformer_layer=paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ) - if pre_transformer_ln else paddle.nn.Identity(), + bias_attr=None, + ) + if pre_transformer_ln + else paddle.nn.Identity(), # EinOpsRearrange('b l d -> l b d') ), # post_transformer_layer=EinOpsRearrange('l b d -> b l d') @@ -341,144 +363,159 @@ def instantiate_trunk(embed_dim, num_blocks, num_heads, vision_num_heads, pre_transformer_ln=True, add_bias_kv=False, - drop_path=0.0, ) + drop_path=0.0, + ) modality_trunks[ModalityType.TEXT] = instantiate_trunk( text_embed_dim, text_num_blocks, text_num_heads, pre_transformer_ln=False, add_bias_kv=False, - drop_path=0.0, ) + drop_path=0.0, + ) modality_trunks[ModalityType.AUDIO] = instantiate_trunk( audio_embed_dim, audio_num_blocks, audio_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=audio_drop_path, ) + drop_path=audio_drop_path, + ) modality_trunks[ModalityType.DEPTH] = instantiate_trunk( depth_embed_dim, depth_num_blocks, depth_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=depth_drop_path, ) + drop_path=depth_drop_path, + ) modality_trunks[ModalityType.THERMAL] = instantiate_trunk( thermal_embed_dim, thermal_num_blocks, thermal_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=thermal_drop_path, ) + drop_path=thermal_drop_path, + ) modality_trunks[ModalityType.IMU] = instantiate_trunk( imu_embed_dim, imu_num_blocks, imu_num_heads, pre_transformer_ln=False, add_bias_kv=True, - drop_path=imu_drop_path, ) + drop_path=imu_drop_path, + ) return paddle.nn.LayerDict(sublayers=modality_trunks) def _create_modality_heads( - self, - out_embed_dim, - vision_embed_dim, - text_embed_dim, - audio_embed_dim, - depth_embed_dim, - thermal_embed_dim, - imu_embed_dim, ): + self, + out_embed_dim, + vision_embed_dim, + text_embed_dim, + audio_embed_dim, + depth_embed_dim, + thermal_embed_dim, + imu_embed_dim, + ): modality_heads = {} 
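[Reviewer note, not part of the diff] Each of the per-modality head assignments that follow builds the same three-stage pipeline: LayerNorm over the trunk output, selection of the CLS token, and a bias-free linear projection into the shared out_embed_dim space (the text head selects the EOS token instead, and the IMU head adds dropout, but the shape is the same). The hypothetical helper below only makes that shared pattern explicit; SelectCLS stands in for the repository's SelectElement(index=0) and is not part of the PR.

    import paddle

    class SelectCLS(paddle.nn.Layer):
        def forward(self, x):
            # x: [batch, seq_len, embed_dim] -> CLS token at position 0
            return x[:, 0, :]

    def make_head(embed_dim: int, out_embed_dim: int) -> paddle.nn.Sequential:
        return paddle.nn.Sequential(
            paddle.nn.LayerNorm(normalized_shape=embed_dim, epsilon=1e-6),
            SelectCLS(),
            paddle.nn.Linear(embed_dim, out_embed_dim, bias_attr=False),
        )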
modality_heads[ModalityType.VISION] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=vision_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), paddle.nn.Linear( in_features=vision_embed_dim, out_features=out_embed_dim, - bias_attr=False, ), ) + bias_attr=False, + ), + ) modality_heads[ModalityType.TEXT] = SelectEOSAndProject( proj=paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=text_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), paddle.nn.Linear( in_features=text_embed_dim, out_features=out_embed_dim, - bias_attr=False, ), )) + bias_attr=False, + ), + ) + ) modality_heads[ModalityType.AUDIO] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=audio_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), - paddle.nn.Linear( - in_features=audio_embed_dim, - out_features=out_embed_dim, - bias_attr=False), ) + paddle.nn.Linear(in_features=audio_embed_dim, out_features=out_embed_dim, bias_attr=False), + ) modality_heads[ModalityType.DEPTH] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=depth_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), - paddle.nn.Linear( - in_features=depth_embed_dim, - out_features=out_embed_dim, - bias_attr=False), ) + paddle.nn.Linear(in_features=depth_embed_dim, out_features=out_embed_dim, bias_attr=False), + ) modality_heads[ModalityType.THERMAL] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=thermal_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), paddle.nn.Linear( in_features=thermal_embed_dim, out_features=out_embed_dim, - bias_attr=False, ), ) + bias_attr=False, + ), + ) modality_heads[ModalityType.IMU] = paddle.nn.Sequential( paddle.nn.LayerNorm( normalized_shape=imu_embed_dim, epsilon=1e-06, weight_attr=None, - bias_attr=None, ), + bias_attr=None, + ), SelectElement(index=0), paddle.nn.Dropout(p=0.5), - paddle.nn.Linear( - in_features=imu_embed_dim, - out_features=out_embed_dim, - bias_attr=False), ) + paddle.nn.Linear(in_features=imu_embed_dim, out_features=out_embed_dim, bias_attr=False), + ) return paddle.nn.LayerDict(sublayers=modality_heads) def _create_modality_postprocessors(self, out_embed_dim): modality_postprocessors = {} modality_postprocessors[ModalityType.VISION] = Normalize(dim=-1) modality_postprocessors[ModalityType.TEXT] = paddle.nn.Sequential( - Normalize(dim=-1), LearnableLogitScaling(learnable=True)) + Normalize(dim=-1), LearnableLogitScaling(learnable=True) + ) modality_postprocessors[ModalityType.AUDIO] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=20.0, learnable=False), ) + LearnableLogitScaling(logit_scale_init=20.0, learnable=False), + ) modality_postprocessors[ModalityType.DEPTH] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=5.0, learnable=False), ) + LearnableLogitScaling(logit_scale_init=5.0, learnable=False), + ) modality_postprocessors[ModalityType.THERMAL] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=10.0, learnable=False), ) + LearnableLogitScaling(logit_scale_init=10.0, learnable=False), + ) modality_postprocessors[ModalityType.IMU] = paddle.nn.Sequential( Normalize(dim=-1), - LearnableLogitScaling( - logit_scale_init=5.0, learnable=False), ) + 
LearnableLogitScaling(logit_scale_init=5.0, learnable=False), + ) return paddle.nn.LayerDict(sublayers=modality_postprocessors) def forward(self, inputs): @@ -487,24 +524,18 @@ def forward(self, inputs): reduce_list = modality_value.ndim >= 5 if reduce_list: B, S = modality_value.shape[:2] - modality_value = modality_value.reshape( - B * S, *modality_value.shape[2:]) + modality_value = modality_value.reshape(B * S, *modality_value.shape[2:]) if modality_value is not None: - modality_value = self.modality_preprocessors[modality_key](**{ - modality_key: modality_value - }) + modality_value = self.modality_preprocessors[modality_key](**{modality_key: modality_value}) print( f"modal: {modality_key} paddle_modality_value['trunk']['tokens'].mean(): {modality_value['trunk']['tokens'].mean().item()}" ) trunk_inputs = modality_value["trunk"] head_inputs = modality_value["head"] - modality_value = self.modality_trunks[modality_key]( - **trunk_inputs) - modality_value = self.modality_heads[modality_key]( - modality_value, **head_inputs) - modality_value = self.modality_postprocessors[modality_key]( - modality_value) + modality_value = self.modality_trunks[modality_key](**trunk_inputs) + modality_value = self.modality_heads[modality_key](modality_value, **head_inputs) + modality_value = self.modality_postprocessors[modality_key](modality_value) if reduce_list: modality_value = modality_value.reshape(B, S, -1) modality_value = modality_value.mean(axis=1) diff --git a/paddlemix/models/imagebind/multimodal_modules.py b/paddlemix/models/imagebind/multimodal_modules.py index 507b22caefa68..20d6f198208ca 100644 --- a/paddlemix/models/imagebind/multimodal_modules.py +++ b/paddlemix/models/imagebind/multimodal_modules.py @@ -12,11 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gzip import html -import io import math -import sys from functools import lru_cache from typing import Callable, List, Optional, Tuple @@ -24,9 +21,6 @@ import numpy as np import paddle import regex as re -from iopath.common.file_io import g_pathmgr - -import paddlemix.utils.paddle_aux from .helpers import VerboseNNModule, cast_if_src_dtype @@ -35,15 +29,12 @@ def get_sinusoid_encoding_table(n_position, d_hid): """Sinusoid position encoding table""" def get_position_angle_vec(position): - return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) - for hid_j in range(d_hid)] + return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) for hid_j in range(d_hid)] - sinusoid_table = np.array( - [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) - return paddle.to_tensor( - data=sinusoid_table, dtype="float32").unsqueeze(axis=0) + return paddle.to_tensor(data=sinusoid_table, dtype="float32").unsqueeze(axis=0) def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): @@ -53,10 +44,10 @@ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): dim = pos_embed.shape[-1] pos_embed, updated = cast_if_src_dtype(pos_embed, "bfloat16", "float32") pos_embed = paddle.nn.functional.interpolate( - x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), - dim).transpose(perm=[0, 3, 1, 2]), + x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).transpose(perm=[0, 3, 1, 2]), scale_factor=math.sqrt(target_spatial_size / N), - mode="bicubic", ) + mode="bicubic", + ) if updated: pos_embed, _ = cast_if_src_dtype(pos_embed, "float32", "bfloat16") @@ -65,17 +56,12 @@ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): return pos_embed -def interpolate_pos_encoding(npatch_per_img, - pos_embed, - patches_layout, - input_shape=None, - first_patch_idx=1): +def interpolate_pos_encoding(npatch_per_img, pos_embed, patches_layout, input_shape=None, first_patch_idx=1): assert first_patch_idx == 0 or first_patch_idx == 1, "there is 1 CLS token or none" N = pos_embed.shape[1] - first_patch_idx if npatch_per_img == N: return pos_embed - assert (patches_layout[-1] == patches_layout[-2] - ), "Interpolation of pos embed not supported for non-square layouts" + assert patches_layout[-1] == patches_layout[-2], "Interpolation of pos embed not supported for non-square layouts" class_emb = pos_embed[:, :first_patch_idx] pos_embed = pos_embed[:, first_patch_idx:] if input_shape is None or patches_layout[0] == 1: @@ -87,24 +73,20 @@ def interpolate_pos_encoding(npatch_per_img, # pos_embed = pos_embed.view(1, num_frames, num_spatial_tokens, -1) pos_embed = pos_embed.reshape((1, num_frames, num_spatial_tokens, -1)) - pos_embed = interpolate_pos_encoding_2d( - npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) + pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) else: raise ValueError("This type of interpolation isn't implemented") return paddle.concat(x=(class_emb, pos_embed), axis=1) -def _get_pos_embedding(npatch_per_img, - pos_embed, - patches_layout, - input_shape, - first_patch_idx=1): +def _get_pos_embedding(npatch_per_img, pos_embed, patches_layout, input_shape, first_patch_idx=1): pos_embed = interpolate_pos_encoding( npatch_per_img, pos_embed, patches_layout, input_shape=input_shape, - 
first_patch_idx=first_patch_idx, ) + first_patch_idx=first_patch_idx, + ) return pos_embed @@ -113,7 +95,7 @@ class PatchEmbedGeneric(paddle.nn.Layer): PatchEmbed from Hydra """ - def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer]=None): + def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer] = None): super().__init__() if len(proj_stem) > 1: self.proj = paddle.nn.Sequential(*proj_stem) @@ -144,12 +126,13 @@ def forward(self, x): class SpatioTemporalPosEmbeddingHelper(VerboseNNModule): def __init__( - self, - patches_layout: List, - num_patches: int, - num_cls_tokens: int, - embed_dim: int, - learnable: bool, ) -> None: + self, + patches_layout: List, + num_patches: int, + num_cls_tokens: int, + embed_dim: int, + learnable: bool, + ) -> None: super().__init__() self.num_cls_tokens = num_cls_tokens self.patches_layout = patches_layout @@ -161,14 +144,13 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.num_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) paddle.nn.initializer.TruncatedNormal(std=0.02)(self.pos_embed) # import timm # timm.models.layers.trunc_normal_(self.pos_embed, std=0.02) else: - self.register_buffer( - "pos_embed", - get_sinusoid_encoding_table(self.num_tokens, embed_dim)) + self.register_buffer("pos_embed", get_sinusoid_encoding_table(self.num_tokens, embed_dim)) def get_pos_embedding(self, vision_input, all_vision_tokens): input_shape = vision_input.shape @@ -177,24 +159,25 @@ def get_pos_embedding(self, vision_input, all_vision_tokens): pos_embed=self.pos_embed, patches_layout=self.patches_layout, input_shape=input_shape, - first_patch_idx=self.num_cls_tokens, ) + first_patch_idx=self.num_cls_tokens, + ) return pos_embed class RGBDTPreprocessor(VerboseNNModule): def __init__( - self, - rgbt_stem: PatchEmbedGeneric, - depth_stem: Optional[PatchEmbedGeneric], - img_size: Tuple=(3, 224, 224), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - use_type_embed: bool=False, - init_param_style: str="openclip", ) -> None: + self, + rgbt_stem: PatchEmbedGeneric, + depth_stem: Optional[PatchEmbedGeneric], + img_size: Tuple = (3, 224, 224), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + use_type_embed: bool = False, + init_param_style: str = "openclip", + ) -> None: super().__init__() stem = rgbt_stem if rgbt_stem is not None else depth_stem - self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout( - img_size) + self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout(img_size) self.rgbt_stem = rgbt_stem self.depth_stem = depth_stem self.use_pos_embed = pos_embed_fn is not None @@ -205,19 +188,22 @@ def __init__( patches_layout=self.patches_layout, num_cls_tokens=num_cls_tokens, num_patches=self.num_patches, - embed_dim=self.embed_dim, ) + embed_dim=self.embed_dim, + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) if self.use_type_embed: self.type_embed = paddle.create_parameter( shape=[1, 1, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) 
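[Reviewer note, not part of the diff] The preprocessor __init__ above is only reformatted; functionally, every preprocessor tokenizes its input with a patch stem, prepends num_cls_tokens learnable CLS tokens, and adds a (possibly interpolated) positional embedding before handing the tokens to the trunk. A rough stand-alone sketch of that step, with assumed tensor shapes, is:

    import paddle

    def add_cls_and_pos(tokens: paddle.Tensor, cls_token: paddle.Tensor, pos_embed: paddle.Tensor) -> paddle.Tensor:
        # tokens:    [B, N, D]      patch tokens produced by the stem
        # cls_token: [1, C, D]      learnable CLS token(s)
        # pos_embed: [1, C + N, D]  positional embedding covering CLS + patch tokens
        B = tokens.shape[0]
        cls = cls_token.expand([B, -1, -1])            # broadcast over the batch
        tokens = paddle.concat([cls, tokens], axis=1)  # [B, C + N, D]
        return tokens + pos_embed

This mirrors tokenize_input_and_cls_pos further down in the file; the interpolation of pos_embed for non-default resolutions is handled separately by the pos_embedding_helper.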
@paddle.no_grad() @@ -225,11 +211,9 @@ def init_parameters(self, init_param_style): if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.use_pos_embed: - paddle.nn.initializer.Normal()( - self.pos_embedding_helper.pos_embed) + paddle.nn.initializer.Normal()(self.pos_embedding_helper.pos_embed) - self.pos_embedding_helper.pos_embed.set_value( - self.pos_embedding_helper.pos_embed * scale) + self.pos_embedding_helper.pos_embed.set_value(self.pos_embedding_helper.pos_embed * scale) if self.num_cls_tokens > 0: paddle.nn.initializer.Normal()(self.cls_token) @@ -250,8 +234,7 @@ def tokenize_input_and_cls_pos(self, input, stem, mask): class_tokens = self.cls_token.expand(shape=[B, -1, -1]) tokens = paddle.concat(x=(class_tokens, tokens), axis=1) if self.use_pos_embed: - pos_embed = self.pos_embedding_helper.get_pos_embedding(input, - tokens) + pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens) tokens = tokens + pos_embed if self.use_type_embed: tokens = tokens + self.type_embed.expand(shape=[B, -1, -1]) @@ -261,11 +244,9 @@ def forward(self, vision=None, depth=None, patch_mask=None): if patch_mask is not None: raise NotImplementedError() if vision is not None: - vision_tokens = self.tokenize_input_and_cls_pos( - vision, self.rgbt_stem, patch_mask) + vision_tokens = self.tokenize_input_and_cls_pos(vision, self.rgbt_stem, patch_mask) if depth is not None: - depth_tokens = self.tokenize_input_and_cls_pos( - depth, self.depth_stem, patch_mask) + depth_tokens = self.tokenize_input_and_cls_pos(depth, self.depth_stem, patch_mask) if vision is not None and depth is not None: final_tokens = vision_tokens + depth_tokens else: @@ -303,14 +284,15 @@ def build_causal_attention_mask(context_length): class TextPreprocessor(VerboseNNModule): def __init__( - self, - vocab_size: int, - context_length: int, - embed_dim: int, - causal_masking: bool, - supply_seq_len_to_head: bool=True, - num_cls_tokens: int=0, - init_param_style: str="openclip", ) -> None: + self, + vocab_size: int, + context_length: int, + embed_dim: int, + causal_masking: bool, + supply_seq_len_to_head: bool = True, + num_cls_tokens: int = 0, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.vocab_size = vocab_size self.context_length = context_length @@ -319,8 +301,10 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.context_length + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[1, self.context_length + num_cls_tokens, embed_dim])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, self.context_length + num_cls_tokens, embed_dim]) + ), + ) self.causal_masking = causal_masking if self.causal_masking: mask = build_causal_attention_mask(self.context_length) @@ -334,7 +318,8 @@ def __init__( self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() @@ -402,8 +387,7 @@ def forward(self, x): x = x.tile(repeat_times=new_shape) elif self.pad_type == "zero": padarg = [0, 0] * len(x.shape) - padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[ - self.time_dim] + padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[self.time_dim] x = paddle.nn.functional.pad(x=x, pad=padarg) return x @@ -419,9 +403,9 @@ def 
bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ - bs = (list(range(ord("!"), ord("~") + 1)) + - list(range(ord("¡"), ord("¬") + 1)) + - list(range(ord("®"), ord("ÿ") + 1))) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 for b in range(2**8): @@ -457,122 +441,17 @@ def whitespace_clean(text): return text -class SimpleTokenizer(object): - def __init__(self, bpe_path: str, context_length=77): - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with g_pathmgr.open(bpe_path, "rb") as fh: - bpe_bytes = io.BytesIO(fh.read()) - merges: List[str] = gzip.open(bpe_bytes).read().decode( - "utf-8").split("\n") - merges = merges[1:49152 - 256 - 2 + 1] - - merges: List[Tuple[str, .. - .]] = [tuple(merge.split()) for merge in merges] - vocab = list(bytes_to_unicode().values()) - vocab = vocab + [(v + "") for v in vocab] - for merge in merges: - vocab.append("".join(merge)) - - vocab.extend(["<|startoftext|>", "<|endoftext|>"]) - self.encoder = dict(zip(vocab, range(len(vocab)))) - self.decoder = {v: k for k, v in self.encoder.items()} - self.bpe_ranks = dict(zip(merges, range(len(merges)))) - self.cache = { - "<|startoftext|>": "<|startoftext|>", - "<|endoftext|>": "<|endoftext|>", - } - self.pat = re.compile( - "<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", - re.IGNORECASE, ) - self.context_length = context_length - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - - word = tuple(token[:-1]) + (token[-1] + "", ) - pairs = get_pairs(word) - if not pairs: - return token + "" - while True: - bigram = min( - pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - if word[i] == first and i < len(word) - 1 and word[i + - 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - text = whitespace_clean(basic_clean(text)).lower() - for token in re.findall(self.pat, text): - token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) - - bpe_tokens.extend(self.encoder[bpe_token] - for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - def decode(self, tokens): - text = "".join([self.decoder[token] for token in tokens]) - text = (bytearray([self.byte_decoder[c] for c in text]).decode( - "utf-8", errors="replace").replace("", " ")) - return text - - def __call__(self, texts, context_length=None): - if not context_length: - context_length = self.context_length - if isinstance(texts, str): - texts = [texts] - sot_token = self.encoder["<|startoftext|>"] - eot_token = self.encoder["<|endoftext|>"] - all_tokens = [([sot_token] + self.encode(text) + [eot_token]) - for text in texts] - result = paddle.zeros( - shape=[len(all_tokens), context_length], dtype="int64") - for i, tokens in enumerate(all_tokens): - tokens = 
tokens[:context_length] - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) - if len(result) == 1: - return result[0] - return result - - class IMUPreprocessor(VerboseNNModule): def __init__( - self, - kernel_size: int, - imu_stem: PatchEmbedGeneric, - embed_dim: int, - img_size: Tuple=(6, 2000), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - init_param_style: str="openclip", ) -> None: + self, + kernel_size: int, + imu_stem: PatchEmbedGeneric, + embed_dim: int, + img_size: Tuple = (6, 2000), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.imu_stem = imu_stem self.embed_dim = embed_dim @@ -583,16 +462,17 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[ - 1, img_size[1] // kernel_size + num_cls_tokens, embed_dim - ])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim]) + ), + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() @@ -624,8 +504,7 @@ def tokenize_input_and_cls_pos(self, input, stem): def forward(self, imu): - imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose( - perm=[0, 2, 1, 3]) # 需要对齐 + imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose(perm=[0, 2, 1, 3]) # 需要对齐 imu = imu.reshape((imu.shape[0], imu.shape[1], -1)) imu_tokens = self.tokenize_input_and_cls_pos(imu, self.imu_stem) return_dict = {"trunk": {"tokens": imu_tokens}, "head": {}} diff --git a/paddlemix/models/imagebind/multimodal_preprocessors.py b/paddlemix/models/imagebind/multimodal_preprocessors.py index 79910119e60ec..397adbd1d19cf 100644 --- a/paddlemix/models/imagebind/multimodal_preprocessors.py +++ b/paddlemix/models/imagebind/multimodal_preprocessors.py @@ -1,20 +1,18 @@ # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import gzip import html -import io import math from functools import lru_cache from typing import Callable, List, Optional, Tuple @@ -23,7 +21,6 @@ import numpy as np import paddle import regex as re -from iopath.common.file_io import g_pathmgr from .helpers import VerboseNNModule, cast_if_src_dtype @@ -32,15 +29,12 @@ def get_sinusoid_encoding_table(n_position, d_hid): """Sinusoid position encoding table""" def get_position_angle_vec(position): - return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) - for hid_j in range(d_hid)] + return [(position / np.power(10000, 2 * (hid_j // 2) / d_hid)) for hid_j in range(d_hid)] - sinusoid_table = np.array( - [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) - return paddle.to_tensor( - data=sinusoid_table, dtype='float32').unsqueeze(axis=0) + return paddle.to_tensor(data=sinusoid_table, dtype="float32").unsqueeze(axis=0) def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): @@ -48,60 +42,51 @@ def interpolate_pos_encoding_2d(target_spatial_size, pos_embed): if N == target_spatial_size: return pos_embed dim = pos_embed.shape[-1] - pos_embed, updated = cast_if_src_dtype(pos_embed, 'bfloat16', 'float32') + pos_embed, updated = cast_if_src_dtype(pos_embed, "bfloat16", "float32") pos_embed = paddle.nn.functional.interpolate( - x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), - dim).transpose(perm=[0, 3, 1, 2]), + x=pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).transpose(perm=[0, 3, 1, 2]), scale_factor=math.sqrt(target_spatial_size / N), - mode='bicubic', ) + mode="bicubic", + ) if updated: - pos_embed, _ = cast_if_src_dtype(pos_embed, 'float32', 'bfloat16') + pos_embed, _ = cast_if_src_dtype(pos_embed, "float32", "bfloat16") # pos_embed = pos_embed.transpose(perm=[0, 2, 3, 1]).view(1, -1, dim) pos_embed = pos_embed.transpose(perm=[0, 2, 3, 1]).reshape((1, -1, dim)) return pos_embed -def interpolate_pos_encoding(npatch_per_img, - pos_embed, - patches_layout, - input_shape=None, - first_patch_idx=1): - assert first_patch_idx == 0 or first_patch_idx == 1, 'there is 1 CLS token or none' +def interpolate_pos_encoding(npatch_per_img, pos_embed, patches_layout, input_shape=None, first_patch_idx=1): + assert first_patch_idx == 0 or first_patch_idx == 1, "there is 1 CLS token or none" N = pos_embed.shape[1] - first_patch_idx if npatch_per_img == N: return pos_embed - assert (patches_layout[-1] == patches_layout[-2] - ), 'Interpolation of pos embed not supported for non-square layouts' + assert patches_layout[-1] == patches_layout[-2], "Interpolation of pos embed not supported for non-square layouts" class_emb = pos_embed[:, :first_patch_idx] pos_embed = pos_embed[:, first_patch_idx:] if input_shape is None or patches_layout[0] == 1: pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed) elif patches_layout[0] > 1: - assert len(input_shape) == 4, 'temporal interpolation not supported' + assert len(input_shape) == 4, "temporal interpolation not supported" num_frames = patches_layout[0] num_spatial_tokens = patches_layout[1] * patches_layout[2] # pos_embed = pos_embed.view(1, num_frames, num_spatial_tokens, -1) pos_embed = pos_embed.reshape((1, num_frames, num_spatial_tokens, -1)) - pos_embed = interpolate_pos_encoding_2d( - npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) + 
pos_embed = interpolate_pos_encoding_2d(npatch_per_img, pos_embed[0, 0, ...].unsqueeze(axis=0)) else: raise ValueError("This type of interpolation isn't implemented") return paddle.concat(x=(class_emb, pos_embed), axis=1) -def _get_pos_embedding(npatch_per_img, - pos_embed, - patches_layout, - input_shape, - first_patch_idx=1): +def _get_pos_embedding(npatch_per_img, pos_embed, patches_layout, input_shape, first_patch_idx=1): pos_embed = interpolate_pos_encoding( npatch_per_img, pos_embed, patches_layout, input_shape=input_shape, - first_patch_idx=first_patch_idx, ) + first_patch_idx=first_patch_idx, + ) return pos_embed @@ -110,7 +95,7 @@ class PatchEmbedGeneric(paddle.nn.Layer): PatchEmbed from Hydra """ - def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer]=None): + def __init__(self, proj_stem, norm_layer: Optional[paddle.nn.Layer] = None): super().__init__() if len(proj_stem) > 1: self.proj = paddle.nn.Sequential(*proj_stem) @@ -141,12 +126,13 @@ def forward(self, x): class SpatioTemporalPosEmbeddingHelper(VerboseNNModule): def __init__( - self, - patches_layout: List, - num_patches: int, - num_cls_tokens: int, - embed_dim: int, - learnable: bool, ) -> None: + self, + patches_layout: List, + num_patches: int, + num_cls_tokens: int, + embed_dim: int, + learnable: bool, + ) -> None: super().__init__() self.num_cls_tokens = num_cls_tokens self.patches_layout = patches_layout @@ -158,13 +144,12 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.num_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) paddle.nn.initializer.TruncatedNormal(std=0.02)(self.pos_embed) else: - self.register_buffer( - 'pos_embed', - get_sinusoid_encoding_table(self.num_tokens, embed_dim)) + self.register_buffer("pos_embed", get_sinusoid_encoding_table(self.num_tokens, embed_dim)) def get_pos_embedding(self, vision_input, all_vision_tokens): input_shape = vision_input.shape @@ -173,24 +158,25 @@ def get_pos_embedding(self, vision_input, all_vision_tokens): pos_embed=self.pos_embed, patches_layout=self.patches_layout, input_shape=input_shape, - first_patch_idx=self.num_cls_tokens, ) + first_patch_idx=self.num_cls_tokens, + ) return pos_embed class RGBDTPreprocessor(VerboseNNModule): def __init__( - self, - rgbt_stem: PatchEmbedGeneric, - depth_stem: Optional[PatchEmbedGeneric], - img_size: Tuple=(3, 224, 224), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - use_type_embed: bool=False, - init_param_style: str='openclip', ) -> None: + self, + rgbt_stem: PatchEmbedGeneric, + depth_stem: Optional[PatchEmbedGeneric], + img_size: Tuple = (3, 224, 224), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + use_type_embed: bool = False, + init_param_style: str = "openclip", + ) -> None: super().__init__() stem = rgbt_stem if rgbt_stem is not None else depth_stem - self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout( - img_size) + self.patches_layout, self.num_patches, self.embed_dim = stem.get_patch_layout(img_size) self.rgbt_stem = rgbt_stem self.depth_stem = depth_stem self.use_pos_embed = pos_embed_fn is not None @@ -201,39 +187,40 @@ def __init__( patches_layout=self.patches_layout, num_cls_tokens=num_cls_tokens, num_patches=self.num_patches, - embed_dim=self.embed_dim, ) + embed_dim=self.embed_dim, + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, 
self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) if self.use_type_embed: self.type_embed = paddle.create_parameter( shape=[1, 1, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() def init_parameters(self, init_param_style): - if init_param_style == 'openclip': + if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.use_pos_embed: - paddle.nn.initializer.Normal()( - self.pos_embedding_helper.pos_embed) + paddle.nn.initializer.Normal()(self.pos_embedding_helper.pos_embed) - self.pos_embedding_helper.pos_embed.set_value( - self.pos_embedding_helper.pos_embed * scale) + self.pos_embedding_helper.pos_embed.set_value(self.pos_embedding_helper.pos_embed * scale) if self.num_cls_tokens > 0: paddle.nn.initializer.Normal()(self.cls_token) self.cls_token.set_value(self.cls_token * scale) - elif init_param_style == 'vit': + elif init_param_style == "vit": self.cls_token.data.fill_(value=0) else: - raise ValueError(f'Unknown init {init_param_style}') + raise ValueError(f"Unknown init {init_param_style}") if self.use_type_embed: paddle.nn.initializer.Normal()(self.type_embed) @@ -246,8 +233,7 @@ def tokenize_input_and_cls_pos(self, input, stem, mask): class_tokens = self.cls_token.expand(shape=[B, -1, -1]) tokens = paddle.concat(x=(class_tokens, tokens), axis=1) if self.use_pos_embed: - pos_embed = self.pos_embedding_helper.get_pos_embedding(input, - tokens) + pos_embed = self.pos_embedding_helper.get_pos_embedding(input, tokens) tokens = tokens + pos_embed if self.use_type_embed: tokens = tokens + self.type_embed.expand(shape=[B, -1, -1]) @@ -257,16 +243,14 @@ def forward(self, vision=None, depth=None, patch_mask=None): if patch_mask is not None: raise NotImplementedError() if vision is not None: - vision_tokens = self.tokenize_input_and_cls_pos( - vision, self.rgbt_stem, patch_mask) + vision_tokens = self.tokenize_input_and_cls_pos(vision, self.rgbt_stem, patch_mask) if depth is not None: - depth_tokens = self.tokenize_input_and_cls_pos( - depth, self.depth_stem, patch_mask) + depth_tokens = self.tokenize_input_and_cls_pos(depth, self.depth_stem, patch_mask) if vision is not None and depth is not None: final_tokens = vision_tokens + depth_tokens else: final_tokens = vision_tokens if vision is not None else depth_tokens - return_dict = {'trunk': {'tokens': final_tokens}, 'head': {}} + return_dict = {"trunk": {"tokens": final_tokens}, "head": {}} return return_dict @@ -290,21 +274,22 @@ def build_causal_attention_mask(context_length): out_0 = paddle.empty(shape=[context_length, context_length]) out_0.stop_gradient = not False mask = out_0 - mask.fill_(value=float('-inf')) + mask.fill_(value=float("-inf")) mask = paddle.triu(mask, 1) return mask class TextPreprocessor(VerboseNNModule): def __init__( - self, - vocab_size: int, - context_length: int, - embed_dim: int, - causal_masking: bool, - supply_seq_len_to_head: bool=True, - num_cls_tokens: int=0, - init_param_style: str='openclip', ) -> None: + self, + vocab_size: int, + context_length: int, + embed_dim: int, + causal_masking: bool, + supply_seq_len_to_head: bool = True, + num_cls_tokens: int = 0, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.vocab_size = vocab_size self.context_length = 
context_length @@ -313,12 +298,14 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, self.context_length + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[1, self.context_length + num_cls_tokens, embed_dim])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, self.context_length + num_cls_tokens, embed_dim]) + ), + ) self.causal_masking = causal_masking if self.causal_masking: mask = build_causal_attention_mask(self.context_length) - self.register_buffer('mask', mask) + self.register_buffer("mask", mask) self.supply_seq_len_to_head = supply_seq_len_to_head self.num_cls_tokens = num_cls_tokens self.embed_dim = embed_dim @@ -328,24 +315,25 @@ def __init__( self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() - def init_parameters(self, init_param_style='openclip'): + def init_parameters(self, init_param_style="openclip"): paddle.nn.initializer.Normal(std=0.02)(self.token_embedding.weight) paddle.nn.initializer.Normal(std=0.01)(self.pos_embed) - if init_param_style == 'openclip': + if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.num_cls_tokens > 0: paddle.nn.initializer.Normal()(self.cls_token) self.cls_token.set_value(self.cls_token * scale) - elif init_param_style == 'vit': + elif init_param_style == "vit": self.cls_token.data.fill_(value=0) else: - raise ValueError(f'Unknown init {init_param_style}') + raise ValueError(f"Unknown init {init_param_style}") def forward(self, text): text_tokens = self.token_embedding(text) @@ -354,12 +342,12 @@ def forward(self, text): class_tokens = self.cls_token.expand(shape=[B, -1, -1]) text_tokens = paddle.concat(x=(class_tokens, text_tokens), axis=1) text_tokens = text_tokens + self.pos_embed - return_dict = {'trunk': {'tokens': text_tokens}, 'head': {}} + return_dict = {"trunk": {"tokens": text_tokens}, "head": {}} if self.supply_seq_len_to_head: text_lengths = text.argmax(axis=-1) - return_dict['head'] = {'seq_len': text_lengths} + return_dict["head"] = {"seq_len": text_lengths} if self.causal_masking: - return_dict['trunk'].update({'attn_mask': self.mask}) + return_dict["trunk"].update({"attn_mask": self.mask}) return return_dict @@ -376,28 +364,27 @@ def forward(self, x): elif x.ndim == 5: return x else: - raise ValueError(f'Dimension incorrect {x.shape}') + raise ValueError(f"Dimension incorrect {x.shape}") class PadIm2Video(Im2Video): def __init__(self, ntimes, pad_type, time_dim=2): super().__init__(time_dim=time_dim) assert ntimes > 0 - assert pad_type in ['zero', 'repeat'] + assert pad_type in ["zero", "repeat"] self.ntimes = ntimes self.pad_type = pad_type def forward(self, x): x = super().forward(x) if x.shape[self.time_dim] == 1: - if self.pad_type == 'repeat': + if self.pad_type == "repeat": new_shape = [1] * len(x.shape) new_shape[self.time_dim] = self.ntimes x = x.tile(repeat_times=new_shape) - elif self.pad_type == 'zero': + elif self.pad_type == "zero": padarg = [0, 0] * len(x.shape) - padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[ - self.time_dim] + padarg[2 * self.time_dim + 1] = self.ntimes - x.shape[self.time_dim] x = paddle.nn.functional.pad(x=x, pad=padarg) return x @@ -413,9 +400,9 @@ def bytes_to_unicode(): To avoid that, we want 
lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. """ - bs = (list(range(ord('!'), ord('~') + 1)) + - list(range(ord('¡'), ord('¬') + 1)) + - list(range(ord('®'), ord('ÿ') + 1))) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 for b in range(2**8): @@ -446,127 +433,22 @@ def basic_clean(text): def whitespace_clean(text): - text = re.sub('\\s+', ' ', text) + text = re.sub("\\s+", " ", text) text = text.strip() return text -class SimpleTokenizer(object): - def __init__(self, bpe_path: str, context_length=77): - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with g_pathmgr.open(bpe_path, 'rb') as fh: - bpe_bytes = io.BytesIO(fh.read()) - merges: List[str] = gzip.open(bpe_bytes).read().decode( - 'utf-8').split('\n') - merges = merges[1:49152 - 256 - 2 + 1] - - merges: List[Tuple[str, .. - .]] = [tuple(merge.split()) for merge in merges] - vocab = list(bytes_to_unicode().values()) - vocab = vocab + [(v + '') for v in vocab] - for merge in merges: - vocab.append(''.join(merge)) - - vocab.extend(['<|startoftext|>', '<|endoftext|>']) - self.encoder = dict(zip(vocab, range(len(vocab)))) - self.decoder = {v: k for k, v in self.encoder.items()} - self.bpe_ranks = dict(zip(merges, range(len(merges)))) - self.cache = { - '<|startoftext|>': '<|startoftext|>', - '<|endoftext|>': '<|endoftext|>', - } - self.pat = re.compile( - "<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", - re.IGNORECASE, ) - self.context_length = context_length - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - - word = tuple(token[:-1]) + (token[-1] + '', ) - pairs = get_pairs(word) - if not pairs: - return token + '' - while True: - bigram = min( - pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - if word[i] == first and i < len(word) - 1 and word[i + - 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = ' '.join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - text = whitespace_clean(basic_clean(text)).lower() - for token in re.findall(self.pat, text): - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) - - bpe_tokens.extend(self.encoder[bpe_token] - for bpe_token in self.bpe(token).split(' ')) - return bpe_tokens - - def decode(self, tokens): - text = ''.join([self.decoder[token] for token in tokens]) - text = (bytearray([self.byte_decoder[c] for c in text]).decode( - 'utf-8', errors='replace').replace('', ' ')) - return text - - def __call__(self, texts, context_length=None): - if not context_length: - context_length = self.context_length - if isinstance(texts, str): - texts = [texts] - sot_token = self.encoder['<|startoftext|>'] - eot_token = self.encoder['<|endoftext|>'] - all_tokens = [([sot_token] + self.encode(text) + [eot_token]) - for text in texts] - result = paddle.zeros( - shape=[len(all_tokens), context_length], dtype='int64') - 
for i, tokens in enumerate(all_tokens): - tokens = tokens[:context_length] - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) - if len(result) == 1: - return result[0] - return result - - class IMUPreprocessor(VerboseNNModule): def __init__( - self, - kernel_size: int, - imu_stem: PatchEmbedGeneric, - embed_dim: int, - img_size: Tuple=(6, 2000), - num_cls_tokens: int=1, - pos_embed_fn: Optional[Callable]=None, - init_param_style: str='openclip', ) -> None: + self, + kernel_size: int, + imu_stem: PatchEmbedGeneric, + embed_dim: int, + img_size: Tuple = (6, 2000), + num_cls_tokens: int = 1, + pos_embed_fn: Optional[Callable] = None, + init_param_style: str = "openclip", + ) -> None: super().__init__() self.imu_stem = imu_stem self.embed_dim = embed_dim @@ -577,32 +459,33 @@ def __init__( self.pos_embed = paddle.create_parameter( shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Assign(value=paddle.empty( - shape=[ - 1, img_size[1] // kernel_size + num_cls_tokens, embed_dim - ])), ) + default_initializer=paddle.nn.initializer.Assign( + value=paddle.empty(shape=[1, img_size[1] // kernel_size + num_cls_tokens, embed_dim]) + ), + ) if self.num_cls_tokens > 0: self.cls_token = paddle.create_parameter( shape=[1, self.num_cls_tokens, self.embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.init_parameters(init_param_style) @paddle.no_grad() def init_parameters(self, init_param_style): paddle.nn.initializer.TruncatedNormal(std=0.01)(self.pos_embed) - if init_param_style == 'openclip': + if init_param_style == "openclip": scale = self.embed_dim**-0.5 if self.num_cls_tokens > 0: paddle.nn.initializer.TruncatedNormal()(self.cls_token) self.cls_token.set_value(self.cls_token * scale) - elif init_param_style == 'vit': + elif init_param_style == "vit": self.cls_token.data.fill_(value=0) else: - raise ValueError(f'Unknown init {init_param_style}') + raise ValueError(f"Unknown init {init_param_style}") def tokenize_input_and_cls_pos(self, input, stem): tokens = stem.norm_layer(stem.proj(input)) @@ -618,9 +501,8 @@ def tokenize_input_and_cls_pos(self, input, stem): def forward(self, imu): - imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose( - perm=[0, 2, 1, 3]) # 需要对齐 + imu = imu.unfold(-1, self.kernel_size, self.kernel_size).transpose(perm=[0, 2, 1, 3]) # 需要对齐 imu = imu.reshape((imu.shape[0], imu.shape[1], -1)) imu_tokens = self.tokenize_input_and_cls_pos(imu, self.imu_stem) - return_dict = {'trunk': {'tokens': imu_tokens}, 'head': {}} + return_dict = {"trunk": {"tokens": imu_tokens}, "head": {}} return return_dict diff --git a/paddlemix/models/imagebind/transformer.py b/paddlemix/models/imagebind/transformer.py index 59e64bc4c9f8a..bb2fdae67b0d9 100644 --- a/paddlemix/models/imagebind/transformer.py +++ b/paddlemix/models/imagebind/transformer.py @@ -21,36 +21,35 @@ class Attention(paddle.nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, ): + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim**-0.5 - self.qkv = paddle.nn.Linear( - in_features=dim, out_features=dim * 3, bias_attr=qkv_bias) + self.qkv = paddle.nn.Linear(in_features=dim, out_features=dim * 3, 
bias_attr=qkv_bias) self.attn_drop = paddle.nn.Dropout(p=attn_drop) self.proj = paddle.nn.Linear(in_features=dim, out_features=dim) self.proj_drop = paddle.nn.Dropout(p=proj_drop) def forward(self, x): B, N, C = x.shape - qkv = (self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) - .transpose(perm=[2, 0, 3, 1, 4])) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(perm=[2, 0, 3, 1, 4]) q, k, v = qkv[0], qkv[1], qkv[2] x = k perm_2 = list(range(x.ndim)) perm_2[-2] = -1 perm_2[-1] = -2 - attn = q @x.transpose(perm=perm_2) * self.scale + attn = q @ x.transpose(perm=perm_2) * self.scale attn = paddle.nn.functional.softmax(attn, axis=-1) attn = self.attn_drop(attn) - x = attn @v + x = attn @ v perm_3 = list(range(x.ndim)) perm_3[1] = 2 perm_3[2] = 1 @@ -62,20 +61,19 @@ def forward(self, x): class Mlp(paddle.nn.Layer): def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=paddle.nn.GELU, - drop=0.0, ): + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=paddle.nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features - self.fc1 = paddle.nn.Linear( - in_features=in_features, out_features=hidden_features) + self.fc1 = paddle.nn.Linear(in_features=in_features, out_features=hidden_features) self.act = act_layer() - self.fc2 = paddle.nn.Linear( - in_features=hidden_features, out_features=out_features) + self.fc2 = paddle.nn.Linear(in_features=hidden_features, out_features=out_features) self.drop = paddle.nn.Dropout(p=drop) def forward(self, x): @@ -89,31 +87,28 @@ def forward(self, x): class MultiheadAttention(paddle.nn.MultiHeadAttention): def __init__(self, embed_dim, num_heads, *arg, add_bias_kv=None, **kwargs): - super(MultiheadAttention, self).__init__(embed_dim, num_heads, *arg, - **kwargs) + super(MultiheadAttention, self).__init__(embed_dim, num_heads, *arg, **kwargs) self.add_bias_kv = add_bias_kv self.embed_dim = embed_dim if self.add_bias_kv: self.bias_k = paddle.create_parameter( shape=[1, 1, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) self.bias_v = paddle.create_parameter( shape=[1, 1, embed_dim], dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=0.0), ) + default_initializer=paddle.nn.initializer.Constant(value=0.0), + ) def compute_kv(self, key, value): k = self.k_proj(key) v = self.v_proj(value) bsz, _, _ = k.shape if self.add_bias_kv: - k = paddle.concat( - [k, paddle.repeat_interleave( - self.bias_k, bsz, axis=0)], axis=1) - v = paddle.concat( - [v, paddle.repeat_interleave( - self.bias_v, bsz, axis=0)], axis=1) + k = paddle.concat([k, paddle.repeat_interleave(self.bias_k, bsz, axis=0)], axis=1) + v = paddle.concat([v, paddle.repeat_interleave(self.bias_v, bsz, axis=0)], axis=1) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) @@ -122,8 +117,7 @@ def compute_kv(self, key, value): def forward(self, x: paddle.Tensor, attn_mask: paddle.Tensor): # x = paddle.transpose(x, perm=[1,0, 2]) - return super(MultiheadAttention, self).forward( - x, x, x, attn_mask=attn_mask) + return super(MultiheadAttention, self).forward(x, x, x, attn_mask=attn_mask) class ViTAttention(Attention): @@ -142,16 +136,17 @@ def forward(self, x): 
class BlockWithMasking(paddle.nn.Layer): def __init__( - self, - dim: int, - attn_target: Callable, - mlp_ratio: int=4, - act_layer: Callable=paddle.nn.GELU, - norm_layer: Callable=paddle.nn.LayerNorm, - ffn_dropout_rate: float=0.0, - drop_path: float=0.0, - layer_scale_type: Optional[str]=None, - layer_scale_init_value: float=0.0001, ): + self, + dim: int, + attn_target: Callable, + mlp_ratio: int = 4, + act_layer: Callable = paddle.nn.GELU, + norm_layer: Callable = paddle.nn.LayerNorm, + ffn_dropout_rate: float = 0.0, + drop_path: float = 0.0, + layer_scale_type: Optional[str] = None, + layer_scale_init_value: float = 0.0001, + ): super().__init__() assert not isinstance( attn_target, paddle.nn.Layer @@ -167,7 +162,8 @@ def __init__( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, - drop=ffn_dropout_rate, ) + drop=ffn_dropout_rate, + ) self.norm_2 = norm_layer(dim) self.layer_scale_type = layer_scale_type if self.layer_scale_type is not None: @@ -183,21 +179,21 @@ def __init__( self.layer_scale_gamma1 = paddle.create_parameter( shape=gamma_shape, dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=1.0), ) + default_initializer=paddle.nn.initializer.Constant(value=1.0), + ) self.layer_scale_gamma2 = paddle.create_parameter( shape=gamma_shape, dtype="float32", - default_initializer=paddle.nn.initializer.Constant(value=1.0), ) + default_initializer=paddle.nn.initializer.Constant(value=1.0), + ) def forward(self, x: paddle.Tensor, attn_mask: paddle.Tensor): if self.layer_scale_type is None: x = x + self.drop_path(self.attn(self.norm_1(x), attn_mask)) x = x + self.drop_path(self.mlp(self.norm_2(x))) else: - x = (x + self.drop_path(self.attn(self.norm_1(x), attn_mask)) * - self.layer_scale_gamma1) - x = x + self.drop_path(self.mlp(self.norm_2( - x))) * self.layer_scale_gamma2 + x = x + self.drop_path(self.attn(self.norm_1(x), attn_mask)) * self.layer_scale_gamma1 + x = x + self.drop_path(self.mlp(self.norm_2(x))) * self.layer_scale_gamma2 return x @@ -206,21 +202,22 @@ def forward(self, x: paddle.Tensor, attn_mask: paddle.Tensor): class SimpleTransformer(paddle.nn.Layer): def __init__( - self, - attn_target: Callable, - embed_dim: int, - num_blocks: int, - block: Callable=BlockWithMasking, - pre_transformer_layer: Optional[Callable]=None, - post_transformer_layer: Optional[Callable]=None, - drop_path_rate: float=0.0, - drop_path_type: str="progressive", - norm_layer: Callable=_LAYER_NORM, - mlp_ratio: int=4, - ffn_dropout_rate: float=0.0, - layer_scale_type: Optional[str]=None, - layer_scale_init_value: float=0.0001, - weight_init_style: str="jax", ): + self, + attn_target: Callable, + embed_dim: int, + num_blocks: int, + block: Callable = BlockWithMasking, + pre_transformer_layer: Optional[Callable] = None, + post_transformer_layer: Optional[Callable] = None, + drop_path_rate: float = 0.0, + drop_path_type: str = "progressive", + norm_layer: Callable = _LAYER_NORM, + mlp_ratio: int = 4, + ffn_dropout_rate: float = 0.0, + layer_scale_type: Optional[str] = None, + layer_scale_init_value: float = 0.0001, + weight_init_style: str = "jax", + ): """ Simple Transformer with the following features 1. 
Supports masked attention @@ -232,27 +229,26 @@ def __init__( super().__init__() self.pre_transformer_layer = pre_transformer_layer if drop_path_type == "progressive": - dpr = [ - x.item() - for x in paddle.linspace( - start=0, stop=drop_path_rate, num=num_blocks) - ] + dpr = [x.item() for x in paddle.linspace(start=0, stop=drop_path_rate, num=num_blocks)] elif drop_path_type == "uniform": dpr = [drop_path_rate for i in range(num_blocks)] else: raise ValueError(f"Unknown drop_path_type: {drop_path_type}") - self.blocks = paddle.nn.Sequential(* [ - block( - dim=embed_dim, - attn_target=attn_target, - mlp_ratio=mlp_ratio, - ffn_dropout_rate=ffn_dropout_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - layer_scale_type=layer_scale_type, - layer_scale_init_value=layer_scale_init_value, ) - for i in range(num_blocks) - ]) + self.blocks = paddle.nn.Sequential( + *[ + block( + dim=embed_dim, + attn_target=attn_target, + mlp_ratio=mlp_ratio, + ffn_dropout_rate=ffn_dropout_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + layer_scale_type=layer_scale_type, + layer_scale_init_value=layer_scale_init_value, + ) + for i in range(num_blocks) + ] + ) self.post_transformer_layer = post_transformer_layer self.weight_init_style = weight_init_style self.apply(self._init_weights) @@ -273,12 +269,13 @@ def _init_weights(self, m): paddle.nn.initializer.Constant(value=1.0)(m.weight) def forward( - self, - tokens: paddle.Tensor, - attn_mask: paddle.Tensor=None, - use_checkpoint: bool=False, - checkpoint_every_n: int=1, - checkpoint_blk_ids: Optional[List[int]]=None, ): + self, + tokens: paddle.Tensor, + attn_mask: paddle.Tensor = None, + use_checkpoint: bool = False, + checkpoint_every_n: int = 1, + checkpoint_blk_ids: Optional[List[int]] = None, + ): """ Inputs - tokens: data of shape N x L x D (or L x N x D depending on the attention implementation) @@ -290,10 +287,7 @@ def forward( if self.pre_transformer_layer: tokens = self.pre_transformer_layer(tokens) if use_checkpoint and checkpoint_blk_ids is None: - checkpoint_blk_ids = [ - blk_id for blk_id in range(len(self.blocks)) - if blk_id % checkpoint_every_n == 0 - ] + checkpoint_blk_ids = [blk_id for blk_id in range(len(self.blocks)) if blk_id % checkpoint_every_n == 0] if checkpoint_blk_ids: checkpoint_blk_ids = set(checkpoint_blk_ids) for blk_id, blk in enumerate(self.blocks): diff --git a/paddlemix/models/imagebind/utils/kaldi.py b/paddlemix/models/imagebind/utils/kaldi.py index 1b6950f2ecc61..59a53dc16407a 100644 --- a/paddlemix/models/imagebind/utils/kaldi.py +++ b/paddlemix/models/imagebind/utils/kaldi.py @@ -45,13 +45,10 @@ def _get_epsilon(device, dtype): def _next_power_of_2(x: int) -> int: """Returns the smallest power of 2 that is greater than x""" - return 1 if x == 0 else 2**(x - 1).bit_length() + return 1 if x == 0 else 2 ** (x - 1).bit_length() -def _get_strided(waveform: paddle.Tensor, - window_size: int, - window_shift: int, - snip_edges: bool) -> paddle.Tensor: +def _get_strided(waveform: paddle.Tensor, window_size: int, window_shift: int, snip_edges: bool) -> paddle.Tensor: """Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``) representing how the window is shifted along the waveform. Each row is a frame. 
@@ -90,15 +87,12 @@ def _get_strided(waveform: paddle.Tensor, return waveform.as_strided(sizes, strides) -def _feature_window_function(window_type: str, - window_size: int, - blackman_coeff: float, - device: str, - dtype: int) -> paddle.Tensor: +def _feature_window_function( + window_type: str, window_size: int, blackman_coeff: float, device: str, dtype: int +) -> paddle.Tensor: """Returns a window function with the given type and size""" if window_type == HANNING: - return paddle.hann_window( - window_size, periodic=False, device=device, dtype=dtype) + return paddle.hann_window(window_size, periodic=False, device=device, dtype=dtype) elif window_type == HAMMING: return paddle.hamming_window( window_size, @@ -106,82 +100,79 @@ def _feature_window_function(window_type: str, alpha=0.54, beta=0.46, device=device, - dtype=dtype, ) + dtype=dtype, + ) elif window_type == POVEY: - return paddle.hann_window( - window_size, periodic=False, device=device, dtype=dtype).pow(y=0.85) + return paddle.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(y=0.85) elif window_type == RECTANGULAR: return paddle.ones(shape=window_size, dtype=dtype) elif window_type == BLACKMAN: a = 2 * math.pi / (window_size - 1) window_function = paddle.arange(end=window_size).astype(dtype) - return (blackman_coeff - 0.5 * paddle.cos(x=a * window_function) + - (0.5 - blackman_coeff) * paddle.cos(x=2 * a * window_function)) + return ( + blackman_coeff + - 0.5 * paddle.cos(x=a * window_function) + + (0.5 - blackman_coeff) * paddle.cos(x=2 * a * window_function) + ) else: raise Exception("Invalid window type " + window_type) -def _get_log_energy(strided_input: paddle.Tensor, - epsilon: paddle.Tensor, - energy_floor: float) -> paddle.Tensor: +def _get_log_energy(strided_input: paddle.Tensor, epsilon: paddle.Tensor, energy_floor: float) -> paddle.Tensor: """Returns the log energy of size (m) for a strided_input (m,*)""" device, dtype = strided_input.place, strided_input.dtype - log_energy = paddle.maximum( - x=strided_input.pow(y=2).sum(axis=1), y=epsilon).log() + log_energy = paddle.maximum(x=strided_input.pow(y=2).sum(axis=1), y=epsilon).log() if energy_floor == 0.0: return log_energy return paddle.maximum( x=log_energy, - y=paddle.to_tensor( - data=math.log(energy_floor), dtype=dtype, place=device), ) + y=paddle.to_tensor(data=math.log(energy_floor), dtype=dtype, place=device), + ) def _get_waveform_and_window_properties( - waveform: paddle.Tensor, - channel: int, - sample_frequency: float, - frame_shift: float, - frame_length: float, - round_to_power_of_two: bool, - preemphasis_coefficient: float, ) -> Tuple[paddle.Tensor, int, int, - int]: + waveform: paddle.Tensor, + channel: int, + sample_frequency: float, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float, +) -> Tuple[paddle.Tensor, int, int, int]: """Gets the waveform and window properties""" channel = max(channel, 0) - assert channel < waveform.shape[0], "Invalid channel {} for size {}".format( - channel, waveform.shape[0]) + assert channel < waveform.shape[0], "Invalid channel {} for size {}".format(channel, waveform.shape[0]) waveform = waveform[(channel), :] window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS) window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS) - padded_window_size = (_next_power_of_2(window_size) - if round_to_power_of_two else window_size) - assert (2 <= window_size <= len(waveform) - ), "choose a window size {} that is [2, 
{}]".format(window_size, - len(waveform)) + padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size + assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format( + window_size, len(waveform) + ) assert 0 < window_shift, "`window_shift` must be greater than 0" assert ( padded_window_size % 2 == 0 ), "the padded `window_size` must be divisible by two. use `round_to_power_of_two` or change `frame_length`" - assert (0.0 <= preemphasis_coefficient <= 1.0 - ), "`preemphasis_coefficient` must be between [0,1]" + assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]" assert sample_frequency > 0, "`sample_frequency` must be greater than zero" return waveform, window_shift, window_size, padded_window_size def _get_window( - waveform: paddle.Tensor, - padded_window_size: int, - window_size: int, - window_shift: int, - window_type: str, - blackman_coeff: float, - snip_edges: bool, - raw_energy: bool, - energy_floor: float, - dither: float, - remove_dc_offset: bool, - preemphasis_coefficient: float, ) -> Tuple[paddle.Tensor, - paddle.Tensor]: + waveform: paddle.Tensor, + padded_window_size: int, + window_size: int, + window_shift: int, + window_type: str, + blackman_coeff: float, + snip_edges: bool, + raw_energy: bool, + energy_floor: float, + dither: float, + remove_dc_offset: bool, + preemphasis_coefficient: float, +) -> Tuple[paddle.Tensor, paddle.Tensor]: """Gets a window and its log energy Returns: @@ -189,30 +180,25 @@ def _get_window( """ device, dtype = waveform.place, waveform.dtype epsilon = _get_epsilon(device, dtype) - strided_input = _get_strided(waveform, window_size, window_shift, - snip_edges) + strided_input = _get_strided(waveform, window_size, window_shift, snip_edges) if dither != 0.0: - x = paddle.maximum( - x=epsilon, y=paddle.rand( - shape=strided_input.shape, dtype=dtype)) + x = paddle.maximum(x=epsilon, y=paddle.rand(shape=strided_input.shape, dtype=dtype)) rand_gauss = paddle.sqrt(x=-2 * x.log()) * paddle.cos(x=2 * math.pi * x) strided_input = strided_input + rand_gauss * dither if remove_dc_offset: row_means = paddle.mean(x=strided_input, axis=1).unsqueeze(axis=1) strided_input = strided_input - row_means if raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) + signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) if preemphasis_coefficient != 0.0: offset_strided_input = paddle.pad_from_torch( - strided_input.unsqueeze(axis=0), (1, 0), - mode="replicate").squeeze(axis=0) - - strided_input = (strided_input - preemphasis_coefficient * - offset_strided_input[:, :-1]) - window_function = _feature_window_function(window_type, window_size, - blackman_coeff, device, - dtype).unsqueeze(axis=0) + strided_input.unsqueeze(axis=0), (1, 0), mode="replicate" + ).squeeze(axis=0) + + strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1] + window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze( + axis=0 + ) strided_input = strided_input * window_function if padded_window_size != window_size: padding_right = padded_window_size - window_size @@ -220,15 +206,14 @@ def _get_window( strided_input.unsqueeze(axis=0), (0, padding_right), mode="constant", - value=0, ).squeeze(axis=0) + value=0, + ).squeeze(axis=0) if not raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) + signal_log_energy = 
_get_log_energy(strided_input, epsilon, energy_floor) return strided_input, signal_log_energy -def _subtract_column_mean(tensor: paddle.Tensor, - subtract_mean: bool) -> paddle.Tensor: +def _subtract_column_mean(tensor: paddle.Tensor, subtract_mean: bool) -> paddle.Tensor: if subtract_mean: col_means = paddle.mean(x=tensor, axis=0).unsqueeze(axis=0) tensor = tensor - col_means @@ -236,22 +221,23 @@ def _subtract_column_mean(tensor: paddle.Tensor, def spectrogram( - waveform: paddle.Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - min_duration: float=0.0, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, - snip_edges: bool=True, - subtract_mean: bool=False, - window_type: str=POVEY, ) -> paddle.Tensor: + waveform: paddle.Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + min_duration: float = 0.0, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + window_type: str = POVEY, +) -> paddle.Tensor: """Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's compute-spectrogram-feats. @@ -288,18 +274,15 @@ def spectrogram( """ device, dtype = waveform.place, waveform.dtype epsilon = _get_epsilon(device, dtype) - ( + (waveform, window_shift, window_size, padded_window_size,) = _get_waveform_and_window_properties( waveform, - window_shift, - window_size, - padded_window_size, ) = _get_waveform_and_window_properties( - waveform, - channel, - sample_frequency, - frame_shift, - frame_length, - round_to_power_of_two, - preemphasis_coefficient, ) + channel, + sample_frequency, + frame_shift, + frame_length, + round_to_power_of_two, + preemphasis_coefficient, + ) if len(waveform) < min_duration * sample_frequency: return paddle.empty(shape=[0]) strided_input, signal_log_energy = _get_window( @@ -314,7 +297,8 @@ def spectrogram( energy_floor, dither, remove_dc_offset, - preemphasis_coefficient, ) + preemphasis_coefficient, + ) fft = paddle.fft.rfft(x=strided_input) power_spectrum = paddle.maximum(x=fft.abs().pow(y=2.0), y=epsilon).log() power_spectrum[:, (0)] = signal_log_energy @@ -339,12 +323,13 @@ def mel_scale(freq: paddle.Tensor) -> paddle.Tensor: def vtln_warp_freq( - vtln_low_cutoff: float, - vtln_high_cutoff: float, - low_freq: float, - high_freq: float, - vtln_warp_factor: float, - freq: paddle.Tensor, ) -> paddle.Tensor: + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq: float, + high_freq: float, + vtln_warp_factor: float, + freq: paddle.Tensor, +) -> paddle.Tensor: """This computes a VTLN warping function that is not the same as HTK's one, but has similar inputs (this function has the advantage of never producing empty bins). 
@@ -381,11 +366,8 @@ def vtln_warp_freq( Returns: Tensor: Freq after vtln warp """ - assert (vtln_low_cutoff > low_freq - ), "be sure to set the vtln_low option higher than low_freq" - assert ( - vtln_high_cutoff < high_freq - ), "be sure to set the vtln_high option lower than high_freq [or negative]" + assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq" + assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]" l = vtln_low_cutoff * max(1.0, vtln_warp_factor) h = vtln_high_cutoff * min(1.0, vtln_warp_factor) scale = 1.0 / vtln_warp_factor @@ -395,9 +377,9 @@ def vtln_warp_freq( scale_left = (Fl - low_freq) / (l - low_freq) scale_right = (high_freq - Fh) / (high_freq - h) res = paddle.empty_like(x=freq) - outside_low_high_freq = paddle.less_than( - x=freq, y=paddle.to_tensor(low_freq)) | paddle.greater_than( - x=freq, y=paddle.to_tensor(high_freq)) + outside_low_high_freq = paddle.less_than(x=freq, y=paddle.to_tensor(low_freq)) | paddle.greater_than( + x=freq, y=paddle.to_tensor(high_freq) + ) before_l = paddle.less_than(x=freq, y=paddle.to_tensor(l)) before_h = paddle.less_than(x=freq, y=paddle.to_tensor(h)) after_h = paddle.greater_equal(x=freq, y=paddle.to_tensor(h)) @@ -409,12 +391,13 @@ def vtln_warp_freq( def vtln_warp_mel_freq( - vtln_low_cutoff: float, - vtln_high_cutoff: float, - low_freq, - high_freq: float, - vtln_warp_factor: float, - mel_freq: paddle.Tensor, ) -> paddle.Tensor: + vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: paddle.Tensor, +) -> paddle.Tensor: """ Args: vtln_low_cutoff (float): Lower frequency cutoffs for VTLN @@ -434,18 +417,21 @@ def vtln_warp_mel_freq( low_freq, high_freq, vtln_warp_factor, - inverse_mel_scale(mel_freq), )) + inverse_mel_scale(mel_freq), + ) + ) def get_mel_banks( - num_bins: int, - window_length_padded: int, - sample_freq: float, - low_freq: float, - high_freq: float, - vtln_low: float, - vtln_high: float, - vtln_warp_factor: float, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float, +) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Returns: (Tensor, Tensor): The tuple consists of ``bins`` (which is @@ -459,10 +445,8 @@ def get_mel_banks( if high_freq <= 0.0: high_freq += nyquist assert ( - 0.0 <= low_freq < nyquist and 0.0 < high_freq <= nyquist and - low_freq < high_freq - ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format( - low_freq, high_freq, nyquist) + 0.0 <= low_freq < nyquist and 0.0 < high_freq <= nyquist and low_freq < high_freq + ), "Bad values in options: low-freq {} and high-freq {} vs. 
nyquist {}".format(low_freq, high_freq, nyquist) fft_bin_width = sample_freq / window_length_padded mel_low_freq = mel_scale_scalar(low_freq) mel_high_freq = mel_scale_scalar(high_freq) @@ -472,72 +456,69 @@ def get_mel_banks( vtln_high += nyquist assert ( - vtln_warp_factor == 1.0 or low_freq < vtln_low < high_freq and - 0.0 < vtln_high < high_freq and vtln_low < vtln_high + vtln_warp_factor == 1.0 + or low_freq < vtln_low < high_freq + and 0.0 < vtln_high < high_freq + and vtln_low < vtln_high ), "Bad values in options: vtln-low {} and vtln-high {}, versus low-freq {} and high-freq {}".format( - vtln_low, vtln_high, low_freq, high_freq) + vtln_low, vtln_high, low_freq, high_freq + ) bin = paddle.arange(end=num_bins).unsqueeze(axis=1) left_mel = mel_low_freq + bin * mel_freq_delta center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta if vtln_warp_factor != 1.0: - left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, left_mel) - center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, - high_freq, vtln_warp_factor, center_mel) - right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, right_mel) + left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel) + center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel) + right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel) center_freqs = inverse_mel_scale(center_mel) - mel = mel_scale(fft_bin_width * paddle.arange(end=num_fft_bins)).unsqueeze( - axis=0) + mel = mel_scale(fft_bin_width * paddle.arange(end=num_fft_bins)).unsqueeze(axis=0) up_slope = (mel - left_mel) / (center_mel - left_mel) down_slope = (right_mel - mel) / (right_mel - center_mel) if vtln_warp_factor == 1.0: - bins = paddle.maximum( - x=paddle.zeros(shape=[1]), - y=paddle.minimum( - x=up_slope, y=down_slope)) + bins = paddle.maximum(x=paddle.zeros(shape=[1]), y=paddle.minimum(x=up_slope, y=down_slope)) else: bins = paddle.zeros_like(x=up_slope) - up_idx = paddle.greater_than( - x=mel, y=paddle.to_tensor(left_mel)) & paddle.less_equal( - x=mel, y=paddle.to_tensor(center_mel)) - down_idx = paddle.greater_than( - x=mel, y=paddle.to_tensor(center_mel)) & paddle.less_than( - x=mel, y=paddle.to_tensor(right_mel)) + up_idx = paddle.greater_than(x=mel, y=paddle.to_tensor(left_mel)) & paddle.less_equal( + x=mel, y=paddle.to_tensor(center_mel) + ) + down_idx = paddle.greater_than(x=mel, y=paddle.to_tensor(center_mel)) & paddle.less_than( + x=mel, y=paddle.to_tensor(right_mel) + ) bins[up_idx] = up_slope[up_idx] bins[down_idx] = down_slope[down_idx] return bins, center_freqs def fbank( - waveform: paddle.Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - min_duration: float=0.0, - num_mel_bins: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - use_log_fbank: bool=True, - use_power: bool=True, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY, ) -> paddle.Tensor: + waveform: 
paddle.Tensor, + blackman_coeff: float = 0.42, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + use_log_fbank: bool = True, + use_power: bool = True, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> paddle.Tensor: """Create a fbank from a raw audio signal. This matches the input/output of Kaldi's compute-fbank-feats. @@ -586,18 +567,15 @@ def fbank( where m is calculated in _get_strided """ device, dtype = waveform.place, waveform.dtype - ( + (waveform, window_shift, window_size, padded_window_size,) = _get_waveform_and_window_properties( waveform, - window_shift, - window_size, - padded_window_size, ) = _get_waveform_and_window_properties( - waveform, - channel, - sample_frequency, - frame_shift, - frame_length, - round_to_power_of_two, - preemphasis_coefficient, ) + channel, + sample_frequency, + frame_shift, + frame_length, + round_to_power_of_two, + preemphasis_coefficient, + ) if len(waveform) < min_duration * sample_frequency: return paddle.empty(shape=[0], dtype=dtype) @@ -613,7 +591,8 @@ def fbank( energy_floor, dither, remove_dc_offset, - preemphasis_coefficient, ) + preemphasis_coefficient, + ) spectrum = paddle.fft.rfft(x=strided_input).abs() if use_power: spectrum = spectrum.pow(y=2.0) @@ -625,59 +604,56 @@ def fbank( high_freq, vtln_low, vtln_high, - vtln_warp, ) + vtln_warp, + ) mel_energies = mel_energies - mel_energies = paddle.pad_from_torch( - mel_energies, (0, 1), mode="constant", value=0) + mel_energies = paddle.pad_from_torch(mel_energies, (0, 1), mode="constant", value=0) mel_energies = paddle.mm(input=spectrum, mat2=mel_energies.T) if use_log_fbank: - mel_energies = paddle.maximum( - x=mel_energies, y=_get_epsilon(device, dtype)).log() + mel_energies = paddle.maximum(x=mel_energies, y=_get_epsilon(device, dtype)).log() if use_energy: signal_log_energy = signal_log_energy.unsqueeze(axis=1) if htk_compat: - mel_energies = paddle.concat( - x=(mel_energies, signal_log_energy), axis=1) + mel_energies = paddle.concat(x=(mel_energies, signal_log_energy), axis=1) else: - mel_energies = paddle.concat( - x=(signal_log_energy, mel_energies), axis=1) + mel_energies = paddle.concat(x=(signal_log_energy, mel_energies), axis=1) mel_energies = _subtract_column_mean(mel_energies, subtract_mean) return mel_energies def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> paddle.Tensor: i = paddle.arange(end=num_ceps) - return 1.0 + 0.5 * cepstral_lifter * paddle.sin(x=math.pi * i / - cepstral_lifter) + return 1.0 + 0.5 * cepstral_lifter * paddle.sin(x=math.pi * i / cepstral_lifter) def mfcc( - waveform: paddle.Tensor, - blackman_coeff: float=0.42, - cepstral_lifter: float=22.0, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - num_ceps: int=13, - min_duration: float=0.0, - num_mel_bins: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - 
round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY, ) -> paddle.Tensor: + waveform: paddle.Tensor, + blackman_coeff: float = 0.42, + cepstral_lifter: float = 22.0, + channel: int = -1, + dither: float = 0.0, + energy_floor: float = 1.0, + frame_length: float = 25.0, + frame_shift: float = 10.0, + high_freq: float = 0.0, + htk_compat: bool = False, + low_freq: float = 20.0, + num_ceps: int = 13, + min_duration: float = 0.0, + num_mel_bins: int = 23, + preemphasis_coefficient: float = 0.97, + raw_energy: bool = True, + remove_dc_offset: bool = True, + round_to_power_of_two: bool = True, + sample_frequency: float = 16000.0, + snip_edges: bool = True, + subtract_mean: bool = False, + use_energy: bool = False, + vtln_high: float = -500.0, + vtln_low: float = 100.0, + vtln_warp: float = 1.0, + window_type: str = POVEY, +) -> paddle.Tensor: """Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's compute-mfcc-feats. @@ -725,11 +701,11 @@ def mfcc( Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``) where m is calculated in _get_strided """ - assert (num_ceps <= num_mel_bins - ), "num_ceps cannot be larger than num_mel_bins: %d vs %d" % ( - num_ceps, - num_mel_bins, ) - device, dtype = waveform.place, waveform.dtype + assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % ( + num_ceps, + num_mel_bins, + ) + # device, dtype = waveform.place, waveform.dtype feature = fbank( waveform=waveform, blackman_coeff=blackman_coeff, @@ -756,17 +732,17 @@ def mfcc( vtln_high=vtln_high, vtln_low=vtln_low, vtln_warp=vtln_warp, - window_type=window_type, ) + window_type=window_type, + ) if use_energy: signal_log_energy = feature[:, (num_mel_bins if htk_compat else 0)] mel_offset = int(not htk_compat) - feature = feature[:, mel_offset:num_mel_bins + mel_offset] + feature = feature[:, mel_offset : num_mel_bins + mel_offset] dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins) feature = feature.matmul(y=dct_matrix) if cepstral_lifter != 0.0: - lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze( - axis=0) + lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(axis=0) feature *= lifter_coeffs if use_energy: diff --git a/paddlemix/models/imagebind/utils/paddle_aux.py b/paddlemix/models/imagebind/utils/paddle_aux.py index 6ffcfe6590f6e..0ea8c1927317d 100644 --- a/paddlemix/models/imagebind/utils/paddle_aux.py +++ b/paddlemix/models/imagebind/utils/paddle_aux.py @@ -59,10 +59,10 @@ def to(self, *args, **kwargs): if isinstance(kwargs["x"], paddle.dtype): dtype = kwargs["x"] elif isinstance(kwargs["x"], str) and kwargs["x"] not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = kwargs["x"] elif isinstance(kwargs["x"], paddle.Tensor): @@ -78,8 +78,7 @@ def to(self, *args, **kwargs): if x not in ["cpu", "cuda", "ipu", "xpu"]: dtype = kwargs["x"] else: - dtype = kwargs["y"] if isinstance(kwargs["y"], - str) else self.dtype + dtype = kwargs["y"] if isinstance(kwargs["y"], str) else self.dtype else: dtype = kwargs["x"] return paddle.cast(self, dtype) @@ -99,11 +98,9 @@ def split(self, *args, **kwargs): elif kwargs: if "dim" in kwargs: kwargs["axis"] = kwargs.pop("dim") - kwargs["num_or_sections"] = self.shape[kwargs[ - "axis"]] // kwargs.pop("split_size") + 
kwargs["num_or_sections"] = self.shape[kwargs["axis"]] // kwargs.pop("split_size") else: - kwargs["num_or_sections"] = self.shape[0] // kwargs.pop( - "split_size") + kwargs["num_or_sections"] = self.shape[0] // kwargs.pop("split_size") return paddle.split(self, **kwargs) @@ -116,7 +113,7 @@ def i0(self, input): K = paddle.arange(0, 20).astype("float32") m = 0 for k in K: - m += ((input**2) / 4)**k / math.factorial(k)**2 + m += ((input**2) / 4) ** k / math.factorial(k) ** 2 return m @@ -128,7 +125,7 @@ def i0(self, input): def stride(self, dim): shape = self.shape shape.append(1) - return paddle.cumprod(paddle.to_tensor(shape)[dim + 1:], dim=0)[-1].item() + return paddle.cumprod(paddle.to_tensor(shape)[dim + 1 :], dim=0)[-1].item() setattr(paddle.Tensor, "stride", stride) @@ -144,14 +141,20 @@ def as_strided(self, size, stride): hh = paddle.expand(h, (dx, dy)).flatten(0) datas = [] for i in range(0, size[0] * stride[0], stride[0]): - axes = [0, ] - starts = [i, ] - ends = [stride[1] * size[1] + i, ] - strides = [stride[1], ] - new_x = paddle.strided_slice( - ww, axes=axes, starts=starts, ends=ends, strides=strides) - new_y = paddle.strided_slice( - hh, axes=axes, starts=starts, ends=ends, strides=strides) + axes = [ + 0, + ] + starts = [ + i, + ] + ends = [ + stride[1] * size[1] + i, + ] + strides = [ + stride[1], + ] + new_x = paddle.strided_slice(ww, axes=axes, starts=starts, ends=ends, strides=strides) + new_y = paddle.strided_slice(hh, axes=axes, starts=starts, ends=ends, strides=strides) datas.append(self[new_y, new_x]) return paddle.stack(datas) @@ -163,19 +166,15 @@ def hann_window(window_length, periodic=True, **kwargs): N = window_length x = paddle.arange(N) if periodic: - return paddle.sin(math.pi * x / (N))**2 + return paddle.sin(math.pi * x / (N)) ** 2 else: - return paddle.sin(math.pi * x / (N - 1))**2 + return paddle.sin(math.pi * x / (N - 1)) ** 2 setattr(paddle, "hann_window", hann_window) -def hamming_window(window_length, - periodic=True, - alpha=0.54, - beta=0.46, - **kwargs): +def hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, **kwargs): N = window_length x = paddle.arange(N) if periodic: @@ -189,17 +188,14 @@ def hamming_window(window_length, def pad(input, pad, mode="constant", value=0.0): data_formats = {3: "NCL", 4: "NCHW", 5: "NCDHW"} - shape = input.shape if input.dim() == 2: input = input.unsqueeze(0) n = len(input.shape) pad = list(pad) + [0] * (n - 3) * 2 - pad = pad[:(n - 2) * 2] - return paddle.nn.functional.pad(input, - pad=tuple(pad), - mode=mode, - value=value, - data_format=data_formats[n]).squeeze() + pad = pad[: (n - 2) * 2] + return paddle.nn.functional.pad( + input, pad=tuple(pad), mode=mode, value=value, data_format=data_formats[n] + ).squeeze() setattr(paddle, "pad_from_torch", pad) diff --git a/paddlemix/models/imagebind/utils/resample.py b/paddlemix/models/imagebind/utils/resample.py index 10cc433bf7d66..c048c3df74fb9 100644 --- a/paddlemix/models/imagebind/utils/resample.py +++ b/paddlemix/models/imagebind/utils/resample.py @@ -13,29 +13,28 @@ # limitations under the License. 
import math -import sys -from typing import List, Optional, Tuple, Union +from typing import Optional import paddle def _get_sinc_resample_kernel( - orig_freq: int, - new_freq: int, - gcd: int, - lowpass_filter_width: int=6, - rolloff: float=0.99, - resampling_method: str="sinc_interpolation", - beta: Optional[float]=None, - device: str=str("cpu").replace("cuda", "gpu"), - dtype: Optional[paddle.dtype]=None, ): + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, + device: str = str("cpu").replace("cuda", "gpu"), + dtype: Optional[paddle.dtype] = None, +): if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): raise Exception( "Frequencies must be of integer type to ensure quality resampling computation. To work around this, manually convert both frequencies to integer values that maintain their resampling rate ratio before passing them into the function. Example: To downsample a 44100 hz waveform by a factor of 8, use `orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`." ) if resampling_method not in ["sinc_interpolation", "kaiser_window"]: - raise ValueError("Invalid resampling method: {}".format( - resampling_method)) + raise ValueError("Invalid resampling method: {}".format(resampling_method)) orig_freq = int(orig_freq) // gcd new_freq = int(new_freq) // gcd assert lowpass_filter_width > 0 @@ -49,54 +48,41 @@ def _get_sinc_resample_kernel( t = (-i / new_freq + idx / orig_freq) * base_freq t = t.clip_(min=-lowpass_filter_width, max=lowpass_filter_width) if resampling_method == "sinc_interpolation": - window = paddle.cos(x=t * math.pi / lowpass_filter_width / 2)**2 + window = paddle.cos(x=t * math.pi / lowpass_filter_width / 2) ** 2 else: if beta is None: beta = 14.769656459379492 beta_tensor = paddle.to_tensor(data=float(beta)) - window = paddle.i0(beta_tensor * paddle.sqrt( - x=1 - (t / lowpass_filter_width)**2)) / paddle.i0(beta_tensor) + window = paddle.i0(beta_tensor * paddle.sqrt(x=1 - (t / lowpass_filter_width) ** 2)) / paddle.i0( + beta_tensor + ) t *= math.pi # breakpoint() - kernel = paddle.where( - condition=t == 0, - x=paddle.to_tensor(data=1.0), - y=paddle.sin(x=t) / t) + kernel = paddle.where(condition=t == 0, x=paddle.to_tensor(data=1.0), y=paddle.sin(x=t) / t) paddle.assign(paddle.multiply(kernel, window), kernel) # kernel.scale_(scale=window) kernels.append(kernel) scale = base_freq / orig_freq - kernels = paddle.stack(x=kernels).reshape( - (new_freq, 1, -1)).scale_(scale=scale) + kernels = paddle.stack(x=kernels).reshape((new_freq, 1, -1)).scale_(scale=scale) if dtype is None: kernels = kernels.to(dtype="float32") return kernels, width -def _apply_sinc_resample_kernel(waveform, - orig_freq: int, - new_freq: int, - gcd: int, - kernel, - width: int): +def _apply_sinc_resample_kernel(waveform, orig_freq: int, new_freq: int, gcd: int, kernel, width: int): if not waveform.is_floating_point(): - raise TypeError( - f"Expected floating point type for waveform tensor, but received {waveform.dtype}." 
- ) + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") orig_freq = int(orig_freq) // gcd new_freq = int(new_freq) // gcd shape = waveform.shape waveform = waveform.reshape((-1, shape[-1])) num_wavs, length = waveform.shape - waveform = paddle.nn.functional.pad(waveform.unsqueeze(1), - (width, width + orig_freq), - data_format="NCL").squeeze() + waveform = paddle.nn.functional.pad(waveform.unsqueeze(1), (width, width + orig_freq), data_format="NCL").squeeze() if waveform.dim() == 1: waveform = waveform.unsqueeze(0) - resampled = paddle.nn.functional.conv1d( - x=waveform[:, (None)], weight=kernel, stride=orig_freq) + resampled = paddle.nn.functional.conv1d(x=waveform[:, (None)], weight=kernel, stride=orig_freq) x = resampled perm_0 = list(range(x.ndim)) perm_0[1] = 2 @@ -110,13 +96,14 @@ def _apply_sinc_resample_kernel(waveform, def resample( - waveform, - orig_freq: int, - new_freq: int, - lowpass_filter_width: int=6, - rolloff: float=0.99, - resampling_method: str="sinc_interpolation", - beta: Optional[float]=None, ): + waveform, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interpolation", + beta: Optional[float] = None, +): """Resamples the waveform at the new frequency using bandlimited interpolation. [:footcite:`RESAMPLE`]. .. devices:: CPU CUDA @@ -155,8 +142,8 @@ def resample( resampling_method, beta, waveform.place, - waveform.dtype, ) + waveform.dtype, + ) - resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, - kernel, width) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) return resampled diff --git a/paddlemix/models/minigpt4/configuration.py b/paddlemix/models/minigpt4/configuration.py index 49587bb85c845..6eac08f0366d5 100644 --- a/paddlemix/models/minigpt4/configuration.py +++ b/paddlemix/models/minigpt4/configuration.py @@ -16,8 +16,7 @@ import os from typing import Union -from paddlenlp.transformers.auto.modeling import \ - MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from paddlenlp.transformers.auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlenlp.transformers.llama.configuration import LlamaConfig @@ -74,23 +73,24 @@ class MiniGPT4VisionConfig(PretrainedConfig): model_type = "mimigpt4_vision_model" def __init__( - self, - hidden_size=1408, - intermediate_size=6144, - projection_dim=512, - num_hidden_layers=39, - num_attention_heads=16, - num_channels=3, - image_size=224, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=0.00001, - dropout=0.0, - attention_dropout=0.0, - initializer_range=1e-10, - initializer_factor=1.0, - qkv_bias=True, - **kwargs, ): + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -111,17 +111,13 @@ def __init__( self.qkv_bias = qkv_bias @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, 
pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from MiniGPT4Config if config_dict.get("model_type") == "minigpt4": config_dict = config_dict["vision_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -188,24 +184,25 @@ class MiniGPT4QFormerConfig(PretrainedConfig): model_type = "minigpt4_qformer" def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - position_embedding_type="absolute", - classifier_dropout=None, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, ): + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size @@ -225,18 +222,14 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from MiniGPT4Config if config_dict.get("model_type") == "minigpt4": config_dict = config_dict["qformer_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -290,58 +283,50 @@ class MiniGPT4Config(PretrainedConfig): is_composition = True def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - **kwargs, ): + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): super().__init__(**kwargs) if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the MiniGPT4VisionConfig with default values." - ) + logger.info("vision_config is None. 
initializing the MiniGPT4VisionConfig with default values.") if qformer_config is None: qformer_config = {} - logger.info( - "qformer_config is None. Initializing the MiniGPT4QFormerConfig with default values." - ) + logger.info("qformer_config is None. Initializing the MiniGPT4QFormerConfig with default values.") if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the text config with default values (`LlamaConfig`)." - ) + logger.info("text_config is None. Initializing the text config with default values (`LlamaConfig`).") self.vision_config = MiniGPT4VisionConfig(**vision_config) self.qformer_config = MiniGPT4QFormerConfig(**qformer_config) - text_model_type = (text_config["model_type"] - if "model_type" in text_config else "llama") + text_model_type = text_config["model_type"] if "model_type" in text_config else "llama" if text_model_type == "llama": self.text_config = LlamaConfig(**text_config) else: - raise ValueError( - "Only llama accepted for model_type, but accepted {}.".format( - text_model_type)) + raise ValueError("Only llama accepted for model_type, but accepted {}.".format(text_model_type)) self.num_query_tokens = num_query_tokens self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size - self.use_decoder_only_language_model = ( - self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES self.initializer_factor = 1.0 self.initializer_range = 0.02 @classmethod def from_vision_qformer_text_configs( - cls, - vision_config: MiniGPT4VisionConfig, - qformer_config: MiniGPT4QFormerConfig, - text_config: PretrainedConfig, - **kwargs, ): + cls, + vision_config: MiniGPT4VisionConfig, + qformer_config: MiniGPT4QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): r""" Instantiate a [`MiniGPT4Config`] (or a derived class) from a vision model, Q-Former and language model configurations. 
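# --- Illustrative sketch only, not part of this diff --------------------------
# MiniGPT4Config composes a vision config, a Q-Former config and a Llama text
# config, and `from_vision_qformer_text_configs` (documented just above) builds
# it from the three sub-configs. A minimal, hedged usage sketch, assuming only
# the import paths already shown in this file:
from paddlemix.models.minigpt4.configuration import (
    MiniGPT4Config,
    MiniGPT4QFormerConfig,
    MiniGPT4VisionConfig,
)
from paddlenlp.transformers.llama.configuration import LlamaConfig

vision_cfg = MiniGPT4VisionConfig(image_size=224, patch_size=14)
qformer_cfg = MiniGPT4QFormerConfig(num_hidden_layers=12, cross_attention_frequency=2)
text_cfg = LlamaConfig()
config = MiniGPT4Config.from_vision_qformer_text_configs(
    vision_cfg, qformer_cfg, text_cfg, num_query_tokens=32
)
# __init__ ties the Q-Former's cross-attention width to the vision hidden size:
assert config.qformer_config.encoder_hidden_size == config.vision_config.hidden_size
# ------------------------------------------------------------------------------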
@@ -353,7 +338,8 @@ def from_vision_qformer_text_configs( vision_config=vision_config.to_dict(), qformer_config=qformer_config.to_dict(), text_config=text_config.to_dict(), - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/minigpt4/modeling.py b/paddlemix/models/minigpt4/modeling.py index 85ef2a007f2b0..4652192427488 100644 --- a/paddlemix/models/minigpt4/modeling.py +++ b/paddlemix/models/minigpt4/modeling.py @@ -23,12 +23,18 @@ from paddle.nn import CrossEntropyLoss from paddlenlp.transformers.llama.modeling import LlamaForCausalLM from paddlenlp.transformers.model_outputs import ( - BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions, - ModelOutput) + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) from paddlenlp.transformers.model_utils import ( - PretrainedModel, apply_chunking_to_forward, - find_pruneable_heads_and_indices, prune_linear_layer) + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) from ...activations import ACT2FN from ...utils.initializer import normal_, ones_, zeros_ @@ -37,8 +43,7 @@ MiniGPT4_PRETRAINED_MODEL_ARCHIVE_LIST = [] -from .configuration import (MiniGPT4Config, MiniGPT4QFormerConfig, - MiniGPT4VisionConfig) +from .configuration import MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig __all__ = [ "MiniGPT4Model", @@ -53,15 +58,14 @@ def Parameter(tensor): return paddle.create_parameter( tensor.shape, dtype=tensor.dtype, - default_initializer=nn.initializer.Assign(tensor), ) + default_initializer=nn.initializer.Assign(tensor), + ) def convert_weights_to_dtype(model, dtype: str): # trying to convert model dtype if necessary if dtype not in ["float16", "float32", "float64"]: - raise ValueError( - "Not supported dtype: {}., only [float16, float32, float64] supported.". 
- format(dtype)) + raise ValueError("Not supported dtype: {}., only [float16, float32, float64] supported.".format(dtype)) dtype_mapping = { "float16": paddle.float16, "float32": paddle.float32, @@ -71,12 +75,9 @@ def convert_weights_to_dtype(model, dtype: str): def convert_for_vit(layer): if isinstance(layer, (nn.Linear, nn.Conv1D, nn.Conv2D)): if layer.weight.dtype != dtype_mapping[dtype]: - layer.weight = transfer_param( - layer.weight, restore_data=True, dtype=dtype) - if layer.bias is not None and layer.bias.dtype != dtype_mapping[ - dtype]: - layer.bias = transfer_param( - layer.bias, restore_data=True, dtype=dtype) + layer.weight = transfer_param(layer.weight, restore_data=True, dtype=dtype) + if layer.bias is not None and layer.bias.dtype != dtype_mapping[dtype]: + layer.bias = transfer_param(layer.bias, restore_data=True, dtype=dtype) if isinstance(model, MiniGPT4VisionModel): model.apply(convert_for_vit) @@ -111,9 +112,11 @@ class MiniGPT4ForConditionalGenerationModelOutput(ModelOutput): def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in - ["vision_outputs", "qformer_outputs", "language_model_outputs"] else - getattr(self, k).to_tuple() for k in self.keys()) + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) class MiniGPT4PretrainedModel(PretrainedModel): @@ -125,13 +128,14 @@ class MiniGPT4PretrainedModel(PretrainedModel): config_class = MiniGPT4Config base_model_prefix = "minigpt4" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids", ] + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range - if (isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or - isinstance(module, nn.Linear)): + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): normal_(module.weight, mean=0.0, std=factor) if hasattr(module, "bias") and module.bias is not None: zeros_(module.bias) @@ -141,7 +145,9 @@ def _init_weights(self, module): factor = self.config.vision_config.initializer_range trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) trunc_normal_(module.position_embedding) - trunc_normal_(module.class_embedding, ) + trunc_normal_( + module.class_embedding, + ) elif isinstance(module, nn.LayerNorm): zeros_(module.bias) ones_(module.weight) @@ -154,12 +160,13 @@ def _set_gradient_checkpointing(self, module, value=False): @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path, - from_hf_hub: bool=False, - subfolder: str=None, - *args, - **kwargs, ): + cls, + pretrained_model_name_or_path, + from_hf_hub: bool = False, + subfolder: str = None, + *args, + **kwargs, + ): vit_dtype = kwargs.pop("vit_dtype", "float16") qformer_dtype = kwargs.pop("qformer_dtype", "float32") llama_dtype = kwargs.pop("llama_dtype", "float16") @@ -169,10 +176,10 @@ def from_pretrained( from_hf_hub=from_hf_hub, subfolder=subfolder, *args, - **kwargs, ) + **kwargs, + ) - logger.info( - "Trying to convert dtype for MiniGPT4 model, it may take a while.") + logger.info("Trying to convert dtype for MiniGPT4 model, it may take a while.") if isinstance(model, (MiniGPT4Model, MiniGPT4ForConditionalGeneration)): convert_weights_to_dtype(model.vision_model, dtype=vit_dtype) convert_weights_to_dtype(model.qformer, dtype=qformer_dtype) @@ -203,30 +210,26 @@ def __init__(self, config: 
MiniGPT4VisionConfig): in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, - stride=self.patch_size, ) + stride=self.patch_size, + ) - self.num_patches = (self.image_size // self.patch_size)**2 + self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 - self.position_embedding = Parameter( - paddle.randn([1, self.num_positions, self.embed_dim])) + self.position_embedding = Parameter(paddle.randn([1, self.num_positions, self.embed_dim])) def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding( - pixel_values) # shape = [*, width, grid, grid] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] patch_embeds_shape = paddle.shape(patch_embeds) patch_embeds = paddle.reshape( - patch_embeds, - shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1]).transpose( - [0, 2, 1]) + patch_embeds, shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1] + ).transpose([0, 2, 1]) - class_embeds = self.class_embedding.expand( - [batch_size, 1, -1]).cast(target_dtype) + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) - embeddings = embeddings + self.position_embedding[:, :embeddings.shape[ - 1], :].cast(target_dtype) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) return embeddings @@ -242,13 +245,13 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads}).") + f" {self.num_heads})." 
+ ) self.scale = self.head_dim**-0.5 self.dropout = nn.Dropout(config.attention_dropout) # small tweak here compared to CLIP, no bias here - self.qkv = nn.Linear( - self.embed_dim, 3 * self.embed_dim, bias_attr=False) + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) if config.qkv_bias: q_bias = Parameter(paddle.zeros([self.embed_dim])) @@ -258,41 +261,37 @@ def __init__(self, config): v_bias = None if q_bias is not None: - qkv_bias = paddle.concat( - (q_bias, paddle.zeros_like(v_bias), v_bias)) + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) self.qkv.bias = Parameter(qkv_bias) self.projection = nn.Linear(self.embed_dim, self.embed_dim) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def forward( - self, - hidden_states: paddle.Tensor, - head_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[ - paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[ - paddle.Tensor]]]: + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.shape mixed_qkv = self.qkv(hidden_states) - mixed_qkv = mixed_qkv.reshape( - [bsz, tgt_len, 3, self.num_heads, - embed_dim // self.num_heads]).transpose([2, 0, 3, 1, 4]) + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) query_states, key_states, value_states = ( mixed_qkv[0], mixed_qkv[1], - mixed_qkv[2], ) + mixed_qkv[2], + ) # Take the dot product between "query" and "key" to get the raw attention scores. 
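        # The reformatted lines below are standard scaled dot-product attention:
        # scores = Q @ K^T, scaled by self.scale (= head_dim ** -0.5), softmax over
        # the key axis to obtain attention_probs, an optional multiplicative
        # head_mask, and a weighted sum with V that is transposed back to
        # [batch, seq_len, num_heads, head_dim] before the output projection.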
- attention_scores = paddle.matmul( - query_states, key_states, transpose_y=True) + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) attention_scores = attention_scores * self.scale @@ -307,16 +306,16 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = paddle.matmul(attention_probs, value_states).transpose( - [0, 2, 1, 3]) + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) - new_context_layer_shape = context_layer.shape[:-2] + [self.embed_dim, ] + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] context_layer = context_layer.reshape(new_context_layer_shape) output = self.projection(context_layer) - outputs = (output, attention_probs) if output_attentions else (output, - None) + outputs = (output, attention_probs) if output_attentions else (output, None) return outputs @@ -341,17 +340,16 @@ def __init__(self, config: MiniGPT4Config): super().__init__() self.embed_dim = config.hidden_size self.self_attn = MiniGPT4Attention(config) - self.layer_norm1 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) self.mlp = MiniGPT4MLP(config) - self.layer_norm2 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: """ Args: hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -368,7 +366,8 @@ def forward( hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, head_mask=attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = hidden_states + residual residual = hidden_states hidden_states = self.layer_norm2(hidden_states) @@ -376,10 +375,10 @@ def forward( hidden_states = hidden_states + residual - outputs = (hidden_states, ) + outputs = (hidden_states,) if output_attentions: - outputs += (attn_weights, ) + outputs += (attn_weights,) return outputs @@ -396,20 +395,17 @@ class MiniGPT4Encoder(nn.Layer): def __init__(self, config: MiniGPT4Config): super().__init__() self.config = config - self.layers = nn.LayerList([ - MiniGPT4EncoderLayer(config) - for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.LayerList([MiniGPT4EncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - inputs_embeds, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - BaseModelOutput]: + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: r""" Args: inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -430,13 +426,11 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -444,7 +438,7 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): @@ -456,29 +450,30 @@ def custom_forward(*inputs): layer_outputs = recompute( create_custom_forward(encoder_layer), hidden_states, - attention_mask, ) + attention_mask, + ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) + all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, - attentions=all_attentions, ) + attentions=all_attentions, + ) class MiniGPT4VisionModel(MiniGPT4PretrainedModel): @@ -492,26 +487,23 @@ def __init__(self, config: MiniGPT4VisionConfig): self.embeddings = MiniGPT4VisionEmbeddings(config) self.encoder = MiniGPT4Encoder(config) - self.post_layernorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict 
is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -522,7 +514,8 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.post_layernorm(last_hidden_state) @@ -537,7 +530,8 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) def get_input_embeddings(self): return self.embeddings @@ -547,35 +541,29 @@ class MiniGPT4QFormerMultiHeadAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() self.config = config - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention heads (%d)" - % (config.hidden_size, config.num_attention_heads)) + % (config.hidden_size, config.num_attention_heads) + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) if is_cross_attention: self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) - self.value = nn.Linear(config.encoder_hidden_size, - self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) else: self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -599,31 +587,29 @@ def transpose_for_scores(self, x): return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. 
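        # Three cases are handled below: cross-attention (keys/values are projected
        # from encoder_hidden_states and the encoder's attention mask replaces the
        # self-attention mask), cached self-attention during incremental decoding
        # (the fresh key/value slices are concatenated onto past_key_value along the
        # sequence axis), and plain self-attention (keys/values come directly from
        # hidden_states).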
is_cross_attention = encoder_hidden_states is not None if is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -635,37 +621,25 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = paddle.matmul( - query_layer, key_layer, transpose_y=True) + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": seq_length = hidden_states.shape[1] - position_ids_l = paddle.arange( - seq_length, dtype="int64").reshape([-1, 1]) - position_ids_r = paddle.arange( - seq_length, dtype="int64").reshape([1, -1]) + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.cast( - dtype=query_layer.dtype) # fp16 compatibility + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility if self.position_embedding_type == "relative_key": - relative_position_scores = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = paddle.einsum( - "bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = ( - attention_scores + relative_position_scores_query + - relative_position_scores_key) + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) @@ -694,10 +668,9 @@ def forward( ] 
context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -705,12 +678,10 @@ class MiniGPT4QFormerSelfOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -720,8 +691,7 @@ def forward(self, hidden_states: paddle.Tensor, class MiniGPT4QFormerAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() - self.attention = MiniGPT4QFormerMultiHeadAttention(config, - is_cross_attention) + self.attention = MiniGPT4QFormerMultiHeadAttention(config, is_cross_attention) self.output = MiniGPT4QFormerSelfOutput(config) self.pruned_heads = set() @@ -732,7 +702,8 @@ def prune_heads(self, heads): heads, self.attention.num_attention_heads, self.attention.attention_head_size, - self.pruned_heads, ) + self.pruned_heads, + ) # Prune linear layers self.attention.query = prune_linear_layer(self.attention.query, index) @@ -741,21 +712,20 @@ def prune_heads(self, heads): self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) # Update hyper params and store pruned heads - self.attention.num_attention_heads = self.attention.num_attention_heads - len( - heads) - self.attention.all_head_size = (self.attention.attention_head_size * - self.attention.num_attention_heads) + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - head_mask: Optional[paddle.Tensor]=None, - encoder_hidden_states: Optional[paddle.Tensor]=None, - encoder_attention_mask: Optional[paddle.Tensor]=None, - past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: self_outputs = self.attention( hidden_states, attention_mask, @@ -763,10 +733,10 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = (attention_output,) + 
self_outputs[1:] # add attentions if we output them return outputs @@ -789,12 +759,10 @@ class MiniGPT4QFormerOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -811,8 +779,7 @@ def __init__(self, config, layer_idx): self.layer_idx = layer_idx if layer_idx % config.cross_attention_frequency == 0: - self.crossattention = MiniGPT4QFormerAttention( - config, is_cross_attention=True) + self.crossattention = MiniGPT4QFormerAttention(config, is_cross_attention=True) self.has_cross_attention = True else: self.has_cross_attention = False @@ -821,24 +788,25 @@ def __init__(self, config, layer_idx): self.output_query = MiniGPT4QFormerOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None self_attention_outputs = self.attention( hidden_states, attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] @@ -849,16 +817,15 @@ def forward( if self.has_cross_attention: if encoder_hidden_states is None: - raise ValueError( - "encoder_hidden_states must be given for cross-attention layers" - ) + raise ValueError("encoder_hidden_states must be given for cross-attention layers") cross_attention_outputs = self.crossattention( query_attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) query_attention_output = cross_attention_outputs[0] # add cross attentions if we output attention weights outputs = outputs + cross_attention_outputs[1:-1] @@ -867,25 +834,27 @@ def forward( self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, - query_attention_output, ) + query_attention_output, + ) if attention_output.shape[1] > query_length: layer_output_text = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output[:, query_length:, :], ) - layer_output = paddle.concat( - [layer_output, layer_output_text], axis=1) + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) else: layer_output = apply_chunking_to_forward( 
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output, ) - outputs = (layer_output, ) + outputs + attention_output, + ) + outputs = (layer_output,) + outputs - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs @@ -904,25 +873,25 @@ class MiniGPT4QFormerEncoder(nn.Layer): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.LayerList([ - MiniGPT4QFormerLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ]) + self.layer = nn.LayerList( + [MiniGPT4QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) self.gradient_checkpointing = False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions else None @@ -932,14 +901,12 @@ def forward( for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None - if getattr(self.config, "gradient_checkpointing", - False) and self.training: + if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: logger.warn( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
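# A note on the Q-Former layer stack above (illustration, not part of the diff):
# a layer only instantiates a cross-attention block when
# `layer_idx % cross_attention_frequency == 0`, and the first `query_length`
# positions (the learned query tokens) are routed through
# `intermediate_query`/`output_query`, while any trailing text tokens take the
# ordinary feed-forward path before being concatenated back together.
# With the defaults in this diff (num_hidden_layers=12, cross_attention_frequency=2)
# the layers that attend to the image features are:
#     [i for i in range(12) if i % 2 == 0]   # -> [0, 2, 4, 6, 8, 10]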
@@ -948,8 +915,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions, query_length) + return module(*inputs, past_key_value, output_attentions, query_length) return custom_forward @@ -959,7 +925,8 @@ def custom_forward(*inputs): attention_mask, layer_head_mask, encoder_hidden_states, - encoder_attention_mask, ) + encoder_attention_mask, + ) else: layer_outputs = layer_module( hidden_states, @@ -969,35 +936,39 @@ def custom_forward(*inputs): encoder_attention_mask, past_key_value, output_attentions, - query_length, ) + query_length, + ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) if layer_module.has_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, ) + cross_attentions=all_cross_attentions, + ) class MiniGPT4QFormerModel(MiniGPT4PretrainedModel): @@ -1009,8 +980,7 @@ def __init__(self, config: MiniGPT4QFormerConfig): super().__init__(config) self.config = config - self.layernorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.encoder = MiniGPT4QFormerEncoder(config) @@ -1030,10 +1000,11 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - has_query: bool=False, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: @@ -1054,21 +1025,21 @@ def get_extended_attention_mask( extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
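        # For example, an attention_mask of [1, 1, 0] becomes the additive mask
        # [0.0, 0.0, -10000.0] after the cast and the (1.0 - mask) * -10000.0 step
        # below, so the padded position contributes effectively zero probability
        # after the softmax.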
- extended_attention_mask = extended_attention_mask.cast( - dtype=self.layernorm.weight.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.cast(dtype=self.layernorm.weight.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - def invert_attention_mask( - self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: """ Invert an attention mask (e.g., switches 0. and 1.). Args: @@ -1077,26 +1048,25 @@ def invert_attention_mask( `paddle.Tensor`: The inverted attention mask. """ if encoder_attention_mask.ndim == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, - None, :, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.ndim == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, - None, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 encoder_extended_attention_mask = encoder_extended_attention_mask.cast( - dtype=self.layernorm.weight.dtype) # fp16 compatibility - encoder_extended_attention_mask = ( - 1.0 - encoder_extended_attention_mask) * -1e4 + dtype=self.layernorm.weight.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 return encoder_extended_attention_mask def get_head_mask( - self, - head_mask: Optional[paddle.Tensor], - num_hidden_layers: int, - is_attention_chunked: bool=False, ) -> paddle.Tensor: + self, + head_mask: Optional[paddle.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool = False, + ) -> paddle.Tensor: """ Prepare the head mask if needed. Args: @@ -1111,8 +1081,7 @@ def get_head_mask( `[None]` for each layer. """ if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, - num_hidden_layers) + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: @@ -1123,30 +1092,27 @@ def get_head_mask( def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.ndim == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) elif head_mask.ndim == 2: - head_mask = (head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) - ) # We can specify head_mask for each layer + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.cast( - dtype=self.config. 
- dtype) # switch to float if need + fp16 compatibility + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( - self, - query_embeds, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1166,62 +1132,52 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_key_values_length past_key_values_length = ( - past_key_values[0][0].shape[2] - self.config.query_length - if past_key_values is not None else 0) + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) query_length = query_embeds.shape[1] if query_embeds is not None else 0 - embedding_output = self.layernorm( - query_embeds.cast(self.layernorm.weight.dtype)) + embedding_output = self.layernorm(query_embeds.cast(self.layernorm.weight.dtype)) embedding_output = self.dropout(embedding_output) input_shape = embedding_output.shape[:-1] batch_size, seq_length = input_shape if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ - 0].shape + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape else: ( encoder_batch_size, encoder_sequence_length, - _, ) = encoder_hidden_states.shape + _, + ) = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: - encoder_extended_attention_mask = [ - self.invert_attention_mask(mask) - for mask in encoder_attention_mask - ] + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -1243,7 +1199,8 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - query_length=query_length, ) + query_length=query_length, + ) sequence_output = encoder_outputs[0] pooled_output = sequence_output[:, 0, :] @@ -1256,7 +1213,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class MiniGPT4Model(MiniGPT4PretrainedModel): @@ -1268,27 +1226,24 @@ def __init__(self, config: MiniGPT4Config): self.vision_model = MiniGPT4VisionModel(config.vision_config) - self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ])) + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) self.qformer = MiniGPT4QFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) self.language_model = LlamaForCausalLM(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def get_text_features( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): @@ 
-1306,30 +1261,30 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) >>> text_features = model.get_text_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.language_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return text_outputs def get_image_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1349,32 +1304,30 @@ def get_image_features( >>> inputs = processor.process_images(images=image, return_tensors="pd") >>> image_outputs = model.get_image_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return vision_outputs def get_qformer_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1394,56 +1347,51 @@ def get_qformer_features( >>> inputs = processor.process_images(images=image, return_tensors="pd") >>> qformer_outputs = 
model.get_qformer_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) image_embeds = vision_outputs[0] - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) + return_dict=True, + ) return query_outputs def forward( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - labels: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -1461,69 +1409,61 @@ def forward( >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") >>> outputs = model(**inputs) ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through 
the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) logits = outputs.logits if return_dict else outputs[0] loss = None # we compute the loss here since we need to take into account the sequence length of the query embeds if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :] shift_labels = labels[..., 1:] @@ -1533,18 +1473,20 @@ def forward( loss = loss_fct( shift_logits.reshape([-1, self.config.text_config.vocab_size]), - shift_labels.reshape([-1]), ) + shift_labels.reshape([-1]), + ) if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) - return ((loss, 
) + output) if loss is not None else output + return ((loss,) + output) if loss is not None else output return MiniGPT4ForConditionalGenerationModelOutput( loss=loss, logits=logits, vision_outputs=vision_outputs, qformer_outputs=query_outputs, - language_model_outputs=outputs, ) + language_model_outputs=outputs, + ) class MiniGPT4ForConditionalGeneration(MiniGPT4PretrainedModel): @@ -1556,30 +1498,26 @@ def __init__(self, config: MiniGPT4Config): self.config = config self.vision_model = MiniGPT4VisionModel(config.vision_config) - self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ])) + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) self.qformer = MiniGPT4QFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) self.language_model = LlamaForCausalLM(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def forward( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - labels: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: r""" Examples: ```python @@ -1596,70 +1534,62 @@ def forward( >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") >>> outputs = model(**inputs) ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds 
= paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) logits = outputs.logits if return_dict else outputs[0] loss = None # we compute the loss here since we need to take into account the sequence length of the query embeds if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :] shift_labels = labels[..., 1:] @@ -1669,28 +1599,31 @@ def forward( loss = loss_fct( shift_logits.reshape([-1, self.config.text_config.vocab_size]), - shift_labels.reshape([-1]), ) + shift_labels.reshape([-1]), + ) if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) - return ((loss, ) + output) if loss is not None else output + return ((loss,) + output) if loss is not None else output return MiniGPT4ForConditionalGenerationModelOutput( loss=loss, logits=logits, vision_outputs=vision_outputs, qformer_outputs=query_outputs, - language_model_outputs=outputs, ) + language_model_outputs=outputs, + ) @paddle.no_grad() def generate( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
Args: @@ -1725,64 +1658,57 @@ def generate( """ # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, - **generate_kwargs, ) + **generate_kwargs, + ) return outputs @paddle.no_grad() def encode_images( - self, - pixel_values: paddle.Tensor, # processed image + self, + pixel_values: paddle.Tensor, # processed image ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
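Taken together, `encode_images` here and `generate_with_image_features` further down in this diff let the expensive vision path (ViT, QFormer, language projection) run once per image while several prompts are decoded against the cached features. A hedged usage sketch; the checkpoint path, token ids and image size below are placeholders rather than values taken from this patch:

```python
import paddle

# Placeholders: checkpoint path, token ids and image size are illustrative only.
model = MiniGPT4ForConditionalGeneration.from_pretrained("path/to/minigpt4")
model.eval()

pixel_values = paddle.randn([1, 3, 224, 224])  # a preprocessed image
first_ids = paddle.to_tensor([[1, 2, 3]])       # prompt tokens before the image slot
second_ids = paddle.to_tensor([[4, 5, 6, 7]])   # prompt tokens after the image slot

with paddle.no_grad():
    # Vision encoder + QFormer + language projection run only once per image.
    image_features, image_mask = model.encode_images(pixel_values)
    output_ids = model.generate_with_image_features(
        image_features,
        first_input_ids=first_ids,
        second_input_ids=second_ids,
        image_attention_mask=image_mask,
        max_length=64,
    )
```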
@@ -1807,44 +1733,40 @@ def encode_images( """ # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") return language_model_inputs, language_model_attention_mask @paddle.no_grad() def generate_with_image_features( - self, - image_features: paddle.Tensor, - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - image_attention_mask: Optional[paddle.Tensor]=None, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + image_features: paddle.Tensor, + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + image_attention_mask: Optional[paddle.Tensor] = None, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
Args: @@ -1884,29 +1806,21 @@ def generate_with_image_features( first_embeds = self.language_model.llama.embed_tokens(first_input_ids) second_embeds = self.language_model.llama.embed_tokens(second_input_ids) image_features = paddle.cast(image_features, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, image_features, second_embeds], axis=1) + inputs_embeds = paddle.concat([first_embeds, image_features, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") if image_attention_mask is None: - image_attention_mask = paddle.ones( - image_features.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_features.shape[:-1], dtype="int64") - attention_mask = paddle.concat( - [ - first_attention_mask, image_attention_mask, - second_attention_mask - ], - axis=1) + attention_mask = paddle.concat([first_attention_mask, image_attention_mask, second_attention_mask], axis=1) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, - **generate_kwargs, ) + **generate_kwargs, + ) return outputs diff --git a/paddlemix/models/sam/common.py b/paddlemix/models/sam/common.py index f0ba1b97e6eab..e8fd2dd038e13 100644 --- a/paddlemix/models/sam/common.py +++ b/paddlemix/models/sam/common.py @@ -19,10 +19,7 @@ class MLPBlock(nn.Layer): - def __init__(self, - embedding_dim: int, - mlp_dim: int, - act: Type[nn.Layer]=nn.GELU) -> None: + def __init__(self, embedding_dim: int, mlp_dim: int, act: Type[nn.Layer] = nn.GELU) -> None: super().__init__() self.lin1 = nn.Linear(embedding_dim, mlp_dim) self.lin2 = nn.Linear(mlp_dim, embedding_dim) @@ -33,16 +30,18 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor: class LayerNorm2d(nn.Layer): - def __init__(self, num_channels: int, eps: float=1e-06) -> None: + def __init__(self, num_channels: int, eps: float = 1e-06) -> None: super().__init__() self.weight = paddle.create_parameter( shape=[num_channels], dtype="float32", - default_initializer=nn.initializer.Constant(value=1.0), ) + default_initializer=nn.initializer.Constant(value=1.0), + ) self.bias = paddle.create_parameter( shape=[num_channels], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) self.eps = eps def forward(self, x: paddle.Tensor) -> paddle.Tensor: diff --git a/paddlemix/models/sam/configuration.py b/paddlemix/models/sam/configuration.py index 9c516c863b5b0..26420a4241f79 100644 --- a/paddlemix/models/sam/configuration.py +++ b/paddlemix/models/sam/configuration.py @@ -17,6 +17,8 @@ from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlemix.utils.log import logger + __all__ = ["SamConfig"] @@ -25,16 +27,17 @@ class SamConfig(PretrainedConfig): model_type = "Sam" def __init__( - self, - modelname="Sam", - prompt_embed_dim=256, - image_size=1024, - vit_patch_size=16, - encoder_embed_dim=768, - encoder_depth=12, - encoder_num_heads=12, - encoder_global_attn_indexes=[2, 5, 8, 11], - input_type=None, ): + self, + modelname="Sam", + prompt_embed_dim=256, + image_size=1024, + vit_patch_size=16, + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + 
encoder_global_attn_indexes=[2, 5, 8, 11], + input_type=None, + ): super().__init__() self.modelname = modelname self.prompt_embed_dim = prompt_embed_dim @@ -45,18 +48,14 @@ def __init__( self.encoder_num_heads = encoder_num_heads self.encoder_global_attn_indexes = encoder_global_attn_indexes self.input_type = input_type - self.pixel_mean = ([123.675, 116.28, 103.53], ) + self.pixel_mean = ([123.675, 116.28, 103.53],) self.pixel_std = [58.395, 57.12, 57.375] @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." diff --git a/paddlemix/models/sam/image_encoder.py b/paddlemix/models/sam/image_encoder.py index b4ba60fe04aef..a6ba55f1167cb 100644 --- a/paddlemix/models/sam/image_encoder.py +++ b/paddlemix/models/sam/image_encoder.py @@ -24,23 +24,24 @@ class ImageEncoderViT(nn.Layer): def __init__( - self, - img_size: int=1024, - patch_size: int=16, - in_chans: int=3, - embed_dim: int=768, - depth: int=12, - num_heads: int=12, - mlp_ratio: float=4.0, - out_chans: int=256, - qkv_bias: bool=True, - norm_layer: Type[nn.Layer]=nn.LayerNorm, - act_layer: Type[nn.Layer]=nn.GELU, - use_abs_pos: bool=True, - use_rel_pos: bool=False, - rel_pos_zero_init: bool=True, - window_size: int=0, - global_attn_indexes: Tuple[int, ...]=(), ) -> None: + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Layer] = nn.LayerNorm, + act_layer: Type[nn.Layer] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + ) -> None: """ Args: img_size (int): Input image size. @@ -66,17 +67,17 @@ def __init__( kernel_size=(patch_size, patch_size), stride=(patch_size, patch_size), in_chans=in_chans, - embed_dim=embed_dim, ) + embed_dim=embed_dim, + ) self.pos_embed = None if use_abs_pos: # Initialize absolute positional embedding with pretrain image size. 
self.pos_embed = paddle.create_parameter( - shape=[ - 1, img_size // patch_size, img_size // patch_size, embed_dim - ], + shape=[1, img_size // patch_size, img_size // patch_size, embed_dim], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) self.blocks = nn.LayerList() for i in range(depth): @@ -90,7 +91,8 @@ def __init__( use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, window_size=window_size if i not in global_attn_indexes else 0, - input_size=(img_size // patch_size, img_size // patch_size), ) + input_size=(img_size // patch_size, img_size // patch_size), + ) self.blocks.append(block) self.neck = nn.Sequential( @@ -98,15 +100,18 @@ def __init__( embed_dim, out_chans, kernel_size=1, - bias_attr=False, ), + bias_attr=False, + ), LayerNorm2d(out_chans), nn.Conv2D( out_chans, out_chans, kernel_size=3, padding=1, - bias_attr=False, ), - LayerNorm2d(out_chans), ) + bias_attr=False, + ), + LayerNorm2d(out_chans), + ) def forward(self, x: paddle.Tensor) -> paddle.Tensor: x = self.patch_embed(x) @@ -125,17 +130,18 @@ class Block(nn.Layer): """Transformer blocks with support of window attention and residual propagation blocks""" def __init__( - self, - dim: int, - num_heads: int, - mlp_ratio: float=4.0, - qkv_bias: bool=True, - norm_layer=nn.LayerNorm, - act_layer=nn.GELU, - use_rel_pos: bool=False, - rel_pos_zero_init: bool=True, - window_size: int=0, - input_size: Optional[Tuple[int, int]]=None, ) -> None: + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer=nn.LayerNorm, + act_layer=nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: """ Args: dim (int): Number of input channels. @@ -159,12 +165,11 @@ def __init__( qkv_bias=qkv_bias, use_rel_pos=use_rel_pos, rel_pos_zero_init=rel_pos_zero_init, - input_size=input_size - if window_size == 0 else (window_size, window_size), ) + input_size=input_size if window_size == 0 else (window_size, window_size), + ) self.norm2 = norm_layer(dim) - self.mlp = MLPBlock( - embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) + self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) self.window_size = window_size @@ -191,13 +196,14 @@ class Attention(nn.Layer): """Multi-head Attention block with relative position embeddings.""" def __init__( - self, - dim: int, - num_heads: int=8, - qkv_bias: bool=True, - use_rel_pos: bool=False, - rel_pos_zero_init: bool=True, - input_size: Optional[Tuple[int, int]]=None, ) -> None: + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: """ Args: dim (int): Number of input channels. @@ -213,51 +219,45 @@ def __init__( head_dim = dim // num_heads self.scale = head_dim**-0.5 - self.qkv = nn.Linear( - dim, dim * 3, bias_attr=qkv_bias if not qkv_bias else None) + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias if not qkv_bias else None) self.proj = nn.Linear(dim, dim) self.use_rel_pos = use_rel_pos if self.use_rel_pos: - assert ( - input_size is not None - ), "Input size must be provided if using relative positional encoding." + assert input_size is not None, "Input size must be provided if using relative positional encoding." 
# initialize relative positional embeddings self.rel_pos_h = paddle.create_parameter( shape=[2 * input_size[0] - 1, head_dim], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) self.rel_pos_w = paddle.create_parameter( shape=[2 * input_size[1] - 1, head_dim], dtype="float32", - default_initializer=nn.initializer.Constant(value=0.0), ) + default_initializer=nn.initializer.Constant(value=0.0), + ) def forward(self, x: paddle.Tensor) -> paddle.Tensor: B, H, W, _ = x.shape # qkv with shape (3, B, nHead, H * W, C) - qkv = (self.qkv(x).reshape(shape=[B, H * W, 3, self.num_heads, -1]) - .transpose([2, 0, 3, 1, 4])) + qkv = self.qkv(x).reshape(shape=[B, H * W, 3, self.num_heads, -1]).transpose([2, 0, 3, 1, 4]) # q, k, v with shape (B * nHead, H * W, C) - q, k, v = qkv.reshape(shape=[3, B * self.num_heads, H * W, -1]).unbind( - axis=0) + q, k, v = qkv.reshape(shape=[3, B * self.num_heads, H * W, -1]).unbind(axis=0) - attn = (q * self.scale) @k.transpose([0, 2, 1]) + attn = (q * self.scale) @ k.transpose([0, 2, 1]) if self.use_rel_pos: - attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, - self.rel_pos_w, (H, W), (H, W)) + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) attn = F.softmax(attn, axis=-1) - x = ((attn @v).reshape([B, self.num_heads, H, W, -1]) - .transpose([0, 2, 3, 1, 4]).reshape([B, H, W, -1])) + x = (attn @ v).reshape([B, self.num_heads, H, W, -1]).transpose([0, 2, 3, 1, 4]).reshape([B, H, W, -1]) x = self.proj(x) return x -def window_partition(x: paddle.Tensor, - window_size: int) -> Tuple[paddle.Tensor, Tuple[int, int]]: +def window_partition(x: paddle.Tensor, window_size: int) -> Tuple[paddle.Tensor, Tuple[int, int]]: """ Partition into non-overlapping windows with padding if needed. Args: @@ -273,22 +273,20 @@ def window_partition(x: paddle.Tensor, pad_h = (window_size - H % window_size) % window_size pad_w = (window_size - W % window_size) % window_size if pad_h > 0 or pad_w > 0: - x = paddle.nn.functional.pad( - x=x, pad=(0, 0, 0, pad_w, 0, pad_h, 0, 0)) # 每个维度分两位数进行pad + x = paddle.nn.functional.pad(x=x, pad=(0, 0, 0, pad_w, 0, pad_h, 0, 0)) # 每个维度分两位数进行pad Hp, Wp = H + pad_h, W + pad_w - x = x.reshape( - [B, Hp // window_size, window_size, Wp // window_size, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) + x = x.reshape([B, Hp // window_size, window_size, Wp // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, C]) return windows, (Hp, Wp) def window_unpartition( - windows: paddle.Tensor, - window_size: int, - pad_hw: Tuple[int, int], - hw: Tuple[int, int], ) -> paddle.Tensor: + windows: paddle.Tensor, + window_size: int, + pad_hw: Tuple[int, int], + hw: Tuple[int, int], +) -> paddle.Tensor: """ Window unpartition into original sequences and removing padding. 
Args: @@ -303,9 +301,7 @@ def window_unpartition( Hp, Wp = pad_hw H, W = hw B = windows.shape[0] // (Hp * Wp // window_size // window_size) - x = windows.reshape([ - B, Hp // window_size, Wp // window_size, window_size, window_size, -1 - ]) + x = windows.reshape([B, Hp // window_size, Wp // window_size, window_size, window_size, -1]) x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, -1]) if Hp > H or Wp > W: @@ -313,8 +309,7 @@ def window_unpartition( return x -def get_rel_pos(q_size: int, k_size: int, - rel_pos: paddle.Tensor) -> paddle.Tensor: +def get_rel_pos(q_size: int, k_size: int, rel_pos: paddle.Tensor) -> paddle.Tensor: """ Get relative positional embeddings according to the relative positions of query and key sizes. @@ -333,31 +328,29 @@ def get_rel_pos(q_size: int, k_size: int, rel_pos_resized = F.interpolate( rel_pos.reshape(1, rel_pos.shape[0], -1).transpose([0, 2, 1]), size=max_rel_dist, - mode="linear", ) - rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]).transpose( - [1, 0]) + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]).transpose([1, 0]) else: rel_pos_resized = rel_pos # Scale the coords with short length if shapes for q and k are different. q_coords = paddle.arange(end=q_size)[:, None] * max(k_size / q_size, 1.0) k_coords = paddle.arange(end=k_size)[None, :] * max(q_size / k_size, 1.0) - relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / - k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) h, w = relative_coords.shape - return paddle.index_select(rel_pos_resized, - relative_coords.cast("int64").flatten()).reshape( - (h, w, -1)) + return paddle.index_select(rel_pos_resized, relative_coords.cast("int64").flatten()).reshape((h, w, -1)) def add_decomposed_rel_pos( - attn: paddle.Tensor, - q: paddle.Tensor, - rel_pos_h: paddle.Tensor, - rel_pos_w: paddle.Tensor, - q_size: Tuple[int, int], - k_size: Tuple[int, int], ) -> paddle.Tensor: + attn: paddle.Tensor, + q: paddle.Tensor, + rel_pos_h: paddle.Tensor, + rel_pos_w: paddle.Tensor, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +) -> paddle.Tensor: """ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 @@ -382,8 +375,9 @@ def add_decomposed_rel_pos( rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh) rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw) - attn = (attn.reshape([B, q_h, q_w, k_h, k_w]) + rel_h[:, :, :, :, None] + - rel_w[:, :, :, None, :]).reshape([B, q_h * q_w, k_h * k_w]) + attn = (attn.reshape([B, q_h, q_w, k_h, k_w]) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).reshape( + [B, q_h * q_w, k_h * k_w] + ) return attn @@ -394,12 +388,13 @@ class PatchEmbed(nn.Layer): """ def __init__( - self, - kernel_size: Tuple[int, int]=(16, 16), - stride: Tuple[int, int]=(16, 16), - padding: Tuple[int, int]=(0, 0), - in_chans: int=3, - embed_dim: int=768, ) -> None: + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: """ Args: kernel_size (Tuple): kernel size of the projection layer. 
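Since the reflow turns `window_partition` and `window_unpartition` above into single-line expressions, a small round-trip check may help. It assumes the two helpers from `paddlemix/models/sam/image_encoder.py` are importable and only looks at shapes:

```python
import paddle

x = paddle.randn([2, 10, 10, 8])                      # [B, H, W, C]; 10 is not divisible by 7
windows, pad_hw = window_partition(x, window_size=7)   # pads to 14 x 14 internally
print(windows.shape)                                   # [8, 7, 7, 8]: 2 batches x 4 windows each
restored = window_unpartition(windows, 7, pad_hw, hw=(10, 10))
print(restored.shape)                                  # [2, 10, 10, 8], padding removed
```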
@@ -410,12 +405,7 @@ def __init__( """ super().__init__() - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=kernel_size, - stride=stride, - padding=padding) + self.proj = nn.Conv2D(in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding) def forward(self, x: paddle.Tensor) -> paddle.Tensor: x = self.proj(x) @@ -429,48 +419,44 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor: import paddle import torch from padiff import auto_diff - from segment_anything.modeling import \ - ImageEncoderViT as ImageEncoderViT_torch + from segment_anything.modeling import ImageEncoderViT as ImageEncoderViT_torch image_encoder_t = ImageEncoderViT_torch( depth=12, embed_dim=768, img_size=1024, mlp_ratio=4, - norm_layer=partial( - torch.nn.LayerNorm, eps=1e-6), + norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), num_heads=12, patch_size=16, qkv_bias=True, use_rel_pos=True, global_attn_indexes=[2, 5, 8, 11], window_size=14, - out_chans=256, ) + out_chans=256, + ) image_encoder = ImageEncoderViT( depth=12, embed_dim=768, img_size=1024, mlp_ratio=4, - norm_layer=partial( - paddle.nn.LayerNorm, epsilon=1e-6), + norm_layer=partial(paddle.nn.LayerNorm, epsilon=1e-6), num_heads=12, patch_size=16, qkv_bias=True, use_rel_pos=True, global_attn_indexes=[2, 5, 8, 11], window_size=14, - out_chans=256, ) + out_chans=256, + ) # Generate random numbers of shape (4, 3, 128, 128) random_numbers = np.random.rand(1, 3, 1024, 1024).astype("float32") inp = ( - { - "x": paddle.to_tensor(random_numbers) - }, - { - "x": torch.as_tensor(random_numbers) - }, ) + {"x": paddle.to_tensor(random_numbers)}, + {"x": torch.as_tensor(random_numbers)}, + ) auto_diff( image_encoder, @@ -482,4 +468,5 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor: "rtol": 0, "compare_mode": "mean", "single_step": False, - }, ) + }, + ) diff --git a/paddlemix/models/sam/mask_decoder.py b/paddlemix/models/sam/mask_decoder.py index e3aa90acd5671..babf266b92ef6 100644 --- a/paddlemix/models/sam/mask_decoder.py +++ b/paddlemix/models/sam/mask_decoder.py @@ -20,14 +20,16 @@ class MaskDecoder(paddle.nn.Layer): - def __init__(self, - *, - transformer_dim: int, - transformer: paddle.nn.Layer, - num_multimask_outputs: int=3, - activation: Type[paddle.nn.Layer]=paddle.nn.GELU, - iou_head_depth: int=3, - iou_head_hidden_dim: int=256) -> None: + def __init__( + self, + *, + transformer_dim: int, + transformer: paddle.nn.Layer, + num_multimask_outputs: int = 3, + activation: Type[paddle.nn.Layer] = paddle.nn.GELU, + iou_head_depth: int = 3, + iou_head_hidden_dim: int = 256 + ) -> None: """ Predicts masks given an image and prompt embeddings, using a tranformer architecture. 
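Note the bare `*` in the rewritten `MaskDecoder.__init__` signature: every argument is keyword-only, which is easy to miss in the reflowed diff. A construction sketch mirroring the wiring that `SamModel` uses later in this patch, assuming `MaskDecoder` and `TwoWayTransformer` are imported from the SAM modules in this package:

```python
prompt_embed_dim = 256

mask_decoder = MaskDecoder(
    transformer_dim=prompt_embed_dim,
    transformer=TwoWayTransformer(  # the two-way transformer used by SamModel below
        depth=2,
        embedding_dim=prompt_embed_dim,
        mlp_dim=2048,
        num_heads=8,
    ),
    num_multimask_outputs=3,
    iou_head_depth=3,
    iou_head_hidden_dim=256,
)
```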
@@ -50,36 +52,39 @@ def __init__(self, self.num_multimask_outputs = num_multimask_outputs self.iou_token = paddle.nn.Embedding(1, transformer_dim) self.num_mask_tokens = num_multimask_outputs + 1 - self.mask_tokens = paddle.nn.Embedding(self.num_mask_tokens, - transformer_dim) + self.mask_tokens = paddle.nn.Embedding(self.num_mask_tokens, transformer_dim) self.output_upscaling = paddle.nn.Sequential( paddle.nn.Conv2DTranspose( in_channels=transformer_dim, out_channels=transformer_dim // 4, kernel_size=2, - stride=2, ), + stride=2, + ), LayerNorm2d(transformer_dim // 4), activation(), paddle.nn.Conv2DTranspose( in_channels=transformer_dim // 4, out_channels=transformer_dim // 8, kernel_size=2, - stride=2, ), - activation(), ) - self.output_hypernetworks_mlps = paddle.nn.LayerList(sublayers=[ - MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) - for i in range(self.num_mask_tokens) - ]) - self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, - self.num_mask_tokens, iou_head_depth) + stride=2, + ), + activation(), + ) + self.output_hypernetworks_mlps = paddle.nn.LayerList( + sublayers=[ + MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for i in range(self.num_mask_tokens) + ] + ) + self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth) def forward( - self, - image_embeddings: paddle.Tensor, - image_pe: paddle.Tensor, - sparse_prompt_embeddings: paddle.Tensor, - dense_prompt_embeddings: paddle.Tensor, - multimask_output: bool, ) -> Tuple[paddle.Tensor, paddle.Tensor]: + self, + image_embeddings: paddle.Tensor, + image_pe: paddle.Tensor, + sparse_prompt_embeddings: paddle.Tensor, + dense_prompt_embeddings: paddle.Tensor, + multimask_output: bool, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Predict masks given image and prompt embeddings. @@ -99,7 +104,8 @@ def forward( image_embeddings=image_embeddings, image_pe=image_pe, sparse_prompt_embeddings=sparse_prompt_embeddings, - dense_prompt_embeddings=dense_prompt_embeddings, ) + dense_prompt_embeddings=dense_prompt_embeddings, + ) if multimask_output: mask_slice = slice(1, None) @@ -111,28 +117,24 @@ def forward( return masks, iou_pred def predict_masks( - self, - image_embeddings: paddle.Tensor, - image_pe: paddle.Tensor, - sparse_prompt_embeddings: paddle.Tensor, - dense_prompt_embeddings: paddle.Tensor, ) -> Tuple[paddle.Tensor, - paddle.Tensor]: + self, + image_embeddings: paddle.Tensor, + image_pe: paddle.Tensor, + sparse_prompt_embeddings: paddle.Tensor, + dense_prompt_embeddings: paddle.Tensor, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Predicts masks. 
See 'forward' for more details.""" - output_tokens = paddle.concat( - x=[self.iou_token.weight, self.mask_tokens.weight], axis=0) - output_tokens = output_tokens.unsqueeze(axis=0).expand( - shape=[sparse_prompt_embeddings.shape[0], -1, -1]) - tokens = paddle.concat( - x=(output_tokens, sparse_prompt_embeddings), axis=1) - src = paddle.repeat_interleave( - image_embeddings, tokens.shape[0], axis=0) + output_tokens = paddle.concat(x=[self.iou_token.weight, self.mask_tokens.weight], axis=0) + output_tokens = output_tokens.unsqueeze(axis=0).expand(shape=[sparse_prompt_embeddings.shape[0], -1, -1]) + tokens = paddle.concat(x=(output_tokens, sparse_prompt_embeddings), axis=1) + src = paddle.repeat_interleave(image_embeddings, tokens.shape[0], axis=0) src = src + dense_prompt_embeddings pos_src = paddle.repeat_interleave(image_pe, tokens.shape[0], axis=0) b, c, h, w = src.shape hs, src = self.transformer(src, pos_src, tokens) iou_token_out = hs[:, (0), :] - mask_tokens_out = hs[:, 1:1 + self.num_mask_tokens, :] + mask_tokens_out = hs[:, 1 : 1 + self.num_mask_tokens, :] x = src perm_0 = list(range(x.ndim)) perm_0[1] = 2 @@ -142,37 +144,37 @@ def predict_masks( upscaled_embedding = self.output_upscaling(src) hyper_in_list: List[paddle.Tensor] = [] for i in range(self.num_mask_tokens): - hyper_in_list.append(self.output_hypernetworks_mlps[i]( - mask_tokens_out[:, (i), :])) + hyper_in_list.append(self.output_hypernetworks_mlps[i](mask_tokens_out[:, (i), :])) hyper_in = paddle.stack(x=hyper_in_list, axis=1) b, c, h, w = upscaled_embedding.shape - masks = (hyper_in @upscaled_embedding.reshape([b, c, h * w])).reshape( - [b, -1, h, w]) + masks = (hyper_in @ upscaled_embedding.reshape([b, c, h * w])).reshape([b, -1, h, w]) iou_pred = self.iou_prediction_head(iou_token_out) return masks, iou_pred class MLP(paddle.nn.Layer): def __init__( - self, - input_dim: int, - hidden_dim: int, - output_dim: int, - num_layers: int, - sigmoid_output: bool=False, ) -> None: + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + sigmoid_output: bool = False, + ) -> None: super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) - self.layers = paddle.nn.LayerList(sublayers=(paddle.nn.Linear( - in_features=n, - out_features=k) for n, k in zip([input_dim] + h, h + [output_dim]))) + self.layers = paddle.nn.LayerList( + sublayers=( + paddle.nn.Linear(in_features=n, out_features=k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + ) self.sigmoid_output = sigmoid_output def forward(self, x): for i, layer in enumerate(self.layers): - x = (paddle.nn.functional.relu(x=layer(x)) - if i < self.num_layers - 1 else layer(x)) + x = paddle.nn.functional.relu(x=layer(x)) if i < self.num_layers - 1 else layer(x) if self.sigmoid_output: x = paddle.nn.functional.sigmoid(x=x) return x diff --git a/paddlemix/models/sam/modeling.py b/paddlemix/models/sam/modeling.py index 6d82a9626d4fb..2d64e17c302b0 100644 --- a/paddlemix/models/sam/modeling.py +++ b/paddlemix/models/sam/modeling.py @@ -13,14 +13,11 @@ # limitations under the License. 
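The hypernetwork step in `predict_masks` above is easier to see with the shapes spelled out: each mask token passes through its own small MLP to produce a per-mask weight vector, and a single batched matmul against the upscaled image embedding yields the low-resolution masks. A shape-only sketch with random tensors, using `transformer_dim = 256` so the upscaled embedding has `256 // 8 = 32` channels:

```python
import paddle

b, num_mask_tokens = 1, 4              # 3 multimask outputs + 1 single-mask token
c, h, w = 32, 256, 256                 # channels and spatial size after output_upscaling

hyper_in = paddle.randn([b, num_mask_tokens, c])  # stacked outputs of the per-token MLPs
upscaled_embedding = paddle.randn([b, c, h, w])

masks = (hyper_in @ upscaled_embedding.reshape([b, c, h * w])).reshape([b, -1, h, w])
print(masks.shape)                     # [1, 4, 256, 256]
```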
from functools import partial -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List import numpy as np import paddle -from paddle import nn -from paddle.nn import functional as F -from paddlenlp.transformers.model_utils import (PretrainedModel, - register_base_model) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model from .configuration import SamConfig from .image_encoder import ImageEncoderViT @@ -65,39 +62,44 @@ def __init__(self, config: SamConfig): embed_dim=config.encoder_embed_dim, img_size=image_size, mlp_ratio=4, - norm_layer=partial( - paddle.nn.LayerNorm, epsilon=1e-6), + norm_layer=partial(paddle.nn.LayerNorm, epsilon=1e-6), num_heads=config.encoder_num_heads, patch_size=vit_patch_size, qkv_bias=True, use_rel_pos=True, global_attn_indexes=config.encoder_global_attn_indexes, window_size=14, - out_chans=prompt_embed_dim, ) + out_chans=prompt_embed_dim, + ) self.prompt_encoder = PromptEncoder( embed_dim=prompt_embed_dim, image_embedding_size=(image_embedding_size, image_embedding_size), input_image_size=(image_size, image_size), - mask_in_chans=16, ) + mask_in_chans=16, + ) self.mask_decoder = MaskDecoder( num_multimask_outputs=3, transformer=TwoWayTransformer( depth=2, embedding_dim=prompt_embed_dim, mlp_dim=2048, - num_heads=8, ), + num_heads=8, + ), transformer_dim=prompt_embed_dim, iou_head_depth=3, - iou_head_hidden_dim=256, ) + iou_head_hidden_dim=256, + ) self.eval() self.register_buffer( "pixel_mean", paddle.to_tensor(config.pixel_mean).reshape([-1, 1, 1]), - persistable=False, ) + persistable=False, + ) self.register_buffer( "pixel_std", paddle.to_tensor(config.pixel_std).reshape([-1, 1, 1]), - persistable=False, ) + persistable=False, + ) @property def device(self) -> Any: @@ -111,9 +113,10 @@ def reset_img(self): self.set_image = False def after_forward(self): - masks = masks[0].detach().cpu().numpy() - iou_predictions = iou_predictions[0].detach().cpu().numpy() - low_res_masks = low_res_masks[0].detach().cpu().numpy() + # masks = masks[0].detach().cpu().numpy() + # iou_predictions = iou_predictions[0].detach().cpu().numpy() + # low_res_masks = low_res_masks[0].detach().cpu().numpy() + pass @paddle.no_grad() def prompt_forward_point(self, x=None, coords_paddle=None): @@ -132,7 +135,8 @@ def prompt_forward_point(self, x=None, coords_paddle=None): sparse_embeddings, dense_embeddings = self.prompt_encoder( points=points, boxes=None, - masks=None, ) + masks=None, + ) # Predict masks low_res_masks, iou_predictions = self.mask_decoder( @@ -140,7 +144,8 @@ def prompt_forward_point(self, x=None, coords_paddle=None): image_pe=self.prompt_encoder.get_dense_pe(), sparse_prompt_embeddings=sparse_embeddings, dense_prompt_embeddings=dense_embeddings, - multimask_output=False, ) + multimask_output=False, + ) return low_res_masks @@ -155,7 +160,8 @@ def prompt_forward_box(self, x=None, box_paddle=None): sparse_embeddings, dense_embeddings = self.prompt_encoder( points=None, boxes=box_paddle, - masks=None, ) + masks=None, + ) # Predict masks low_res_masks, iou_predictions = self.mask_decoder( @@ -163,15 +169,19 @@ def prompt_forward_box(self, x=None, box_paddle=None): image_pe=self.prompt_encoder.get_dense_pe(), sparse_prompt_embeddings=sparse_embeddings, dense_prompt_embeddings=dense_embeddings, - multimask_output=False, ) + multimask_output=False, + ) return low_res_masks # , iou_predictions, low_res_masks @paddle.no_grad() def full_mask_forward(self, img: List[Dict[str, Any]], coords_paddle): labels_paddle = paddle.ones( - 
shape=[coords_paddle.shape[0], ], - dtype="int64", ) + shape=[ + coords_paddle.shape[0], + ], + dtype="int64", + ) labels_paddle = paddle.to_tensor(labels_paddle).cast("int32")[:, None] points = (coords_paddle, labels_paddle) @@ -183,7 +193,8 @@ def full_mask_forward(self, img: List[Dict[str, Any]], coords_paddle): sparse_embeddings, dense_embeddings = self.prompt_encoder( points=points, boxes=None, - masks=None, ) + masks=None, + ) # Predict masks low_res_masks, iou_predictions = self.mask_decoder( @@ -191,7 +202,8 @@ def full_mask_forward(self, img: List[Dict[str, Any]], coords_paddle): image_pe=self.prompt_encoder.get_dense_pe(), sparse_prompt_embeddings=sparse_embeddings, dense_prompt_embeddings=dense_embeddings, - multimask_output=False, ) + multimask_output=False, + ) return low_res_masks, iou_predictions # (64, 3) # low_res_masks, @@ -205,7 +217,7 @@ def forward(self, img=None, prompt=None): return masks, iou_predictions else: NotImplementedError( - 'input_type need to be in {"points", "boxs", "points_grid"}, but got: {}'. - format(self.input_type)) + 'input_type need to be in ["points", "boxs", "points_grid"], but got: {}'.format(self.input_type) + ) return masks diff --git a/paddlemix/models/sam/prompt_encoder.py b/paddlemix/models/sam/prompt_encoder.py index ff51c6dd571e6..168bad62aa992 100644 --- a/paddlemix/models/sam/prompt_encoder.py +++ b/paddlemix/models/sam/prompt_encoder.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Optional, Tuple, Type +from typing import Optional, Tuple, Type import numpy as np import paddle @@ -22,12 +22,13 @@ class PromptEncoder(paddle.nn.Layer): def __init__( - self, - embed_dim: int, - image_embedding_size: Tuple[int, int], - input_image_size: Tuple[int, int], - mask_in_chans: int, - activation: Type[paddle.nn.Layer]=paddle.nn.GELU, ) -> None: + self, + embed_dim: int, + image_embedding_size: Tuple[int, int], + input_image_size: Tuple[int, int], + mask_in_chans: int, + activation: Type[paddle.nn.Layer] = paddle.nn.GELU, + ) -> None: """ Encodes prompts for input to SAM's mask decoder. 
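The prompt paths above (`prompt_forward_point`, `prompt_forward_box`, `full_mask_forward`) all funnel into this encoder. A hedged sketch of embedding a single foreground point, assuming `PromptEncoder` is importable; the sizes follow the `SamConfig` defaults earlier in this patch (`prompt_embed_dim=256`, `image_size=1024`, `vit_patch_size=16`), and the point coordinates are arbitrary:

```python
import paddle

prompt_encoder = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),   # 1024 // 16
    input_image_size=(1024, 1024),
    mask_in_chans=16,
)

coords = paddle.to_tensor([[[512.0, 512.0]]])  # [batch, num_points, 2], pixel coordinates
labels = paddle.to_tensor([[1]])                # 1 marks a foreground point
sparse, dense = prompt_encoder(points=(coords, labels), boxes=None, masks=None)
print(sparse.shape)  # [1, 2, 256]: the point plus one padding point (since boxes is None)
print(dense.shape)   # [1, 256, 64, 64]: the learned no-mask embedding, broadcast spatially
```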
@@ -48,33 +49,24 @@ def __init__( self.image_embedding_size = image_embedding_size self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) self.num_point_embeddings: int = 4 - point_embeddings = [ - paddle.nn.Embedding(1, embed_dim) - for i in range(self.num_point_embeddings) - ] + point_embeddings = [paddle.nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)] self.point_embeddings = paddle.nn.LayerList(sublayers=point_embeddings) self.not_a_point_embed = paddle.nn.Embedding(1, embed_dim) - self.mask_input_size = 4 * image_embedding_size[ - 0], 4 * image_embedding_size[1] + self.mask_input_size = 4 * image_embedding_size[0], 4 * image_embedding_size[1] self.mask_downscaling = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=1, - out_channels=mask_in_chans // 4, - kernel_size=2, - stride=2), + paddle.nn.Conv2D(in_channels=1, out_channels=mask_in_chans // 4, kernel_size=2, stride=2), LayerNorm2d(mask_in_chans // 4), activation(), paddle.nn.Conv2D( in_channels=mask_in_chans // 4, out_channels=mask_in_chans, kernel_size=2, - stride=2, ), + stride=2, + ), LayerNorm2d(mask_in_chans), activation(), - paddle.nn.Conv2D( - in_channels=mask_in_chans, - out_channels=embed_dim, - kernel_size=1), ) + paddle.nn.Conv2D(in_channels=mask_in_chans, out_channels=embed_dim, kernel_size=1), + ) self.no_mask_embed = paddle.nn.Embedding(1, embed_dim) def get_dense_pe(self) -> paddle.Tensor: @@ -88,10 +80,7 @@ def get_dense_pe(self) -> paddle.Tensor: """ return self.pe_layer(self.image_embedding_size).unsqueeze(axis=0) - def _embed_points(self, - points: paddle.Tensor, - labels: paddle.Tensor, - pad: bool) -> paddle.Tensor: + def _embed_points(self, points: paddle.Tensor, labels: paddle.Tensor, pad: bool) -> paddle.Tensor: """Embeds point prompts.""" points = points + 0.5 points = points.cast("float32") @@ -99,10 +88,8 @@ def _embed_points(self, padding_point = paddle.zeros(shape=(points.shape[0], 1, 2)) padding_label = -paddle.ones(shape=(labels.shape[0], 1)) points = paddle.concat(x=[points, padding_point], axis=1) - labels = paddle.concat( - x=[labels.astype("float32"), padding_label], axis=1) - point_embedding = self.pe_layer.forward_with_coords( - points, self.input_image_size) + labels = paddle.concat(x=[labels.astype("float32"), padding_label], axis=1) + point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size) point_embedding[labels == -1] = 0.0 if point_embedding[labels == -1].shape[0] != 0: @@ -117,8 +104,7 @@ def _embed_boxes(self, boxes: paddle.Tensor) -> paddle.Tensor: """Embeds box prompts.""" boxes = boxes + 0.5 coords = boxes.reshape([-1, 2, 2]) - corner_embedding = self.pe_layer.forward_with_coords( - coords, self.input_image_size) + corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size) corner_embedding[:, (0), :] += self.point_embeddings[2].weight corner_embedding[:, (1), :] += self.point_embeddings[3].weight return corner_embedding @@ -129,10 +115,11 @@ def _embed_masks(self, masks: paddle.Tensor) -> paddle.Tensor: return mask_embedding def _get_batch_size( - self, - points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], - boxes: Optional[paddle.Tensor], - masks: Optional[paddle.Tensor], ) -> int: + self, + points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], + boxes: Optional[paddle.Tensor], + masks: Optional[paddle.Tensor], + ) -> int: """ Gets the batch size of the output given the batch size of the input prompts. 
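Editor's note: a small illustration of the box handling in _embed_boxes above; each (x1, y1, x2, y2) box is split into a pair of corner points before positional encoding. The box coordinates below are made up.

import paddle

boxes = paddle.to_tensor([[100.0, 150.0, 400.0, 500.0]])  # [num_boxes, 4] = (x1, y1, x2, y2)
corners = (boxes + 0.5).reshape([-1, 2, 2])               # [num_boxes, 2, 2]: top-left, bottom-right
print(corners.shape)                                      # [1, 2, 2]
# In _embed_boxes, corner 0 is then offset by point_embeddings[2] and corner 1 by point_embeddings[3].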
""" @@ -149,11 +136,11 @@ def _get_device(self): return self.point_embeddings[0].weight.place def forward( - self, - points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], - boxes: Optional[paddle.Tensor], - masks: Optional[paddle.Tensor], ) -> Tuple[paddle.Tensor, - paddle.Tensor]: + self, + points: Optional[Tuple[paddle.Tensor, paddle.Tensor]], + boxes: Optional[paddle.Tensor], + masks: Optional[paddle.Tensor], + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Embeds different types of prompts, returning both sparse and dense embeddings. @@ -175,24 +162,22 @@ def forward( sparse_embeddings = paddle.empty(shape=(bs, 0, self.embed_dim)) if points is not None: coords, labels = points - point_embeddings = self._embed_points( - coords, labels, pad=boxes is None) - sparse_embeddings = paddle.concat( - x=[sparse_embeddings, point_embeddings], axis=1) + point_embeddings = self._embed_points(coords, labels, pad=boxes is None) + sparse_embeddings = paddle.concat(x=[sparse_embeddings, point_embeddings], axis=1) if boxes is not None: box_embeddings = self._embed_boxes(boxes) - sparse_embeddings = paddle.concat( - x=[sparse_embeddings, box_embeddings], axis=1) + sparse_embeddings = paddle.concat(x=[sparse_embeddings, box_embeddings], axis=1) if masks is not None: dense_embeddings = self._embed_masks(masks) else: - dense_embeddings = self.no_mask_embed.weight.reshape( - [1, -1, 1, 1]).expand(shape=[ + dense_embeddings = self.no_mask_embed.weight.reshape([1, -1, 1, 1]).expand( + shape=[ bs, -1, self.image_embedding_size[0], self.image_embedding_size[1], - ]) + ] + ) return sparse_embeddings, dense_embeddings @@ -201,27 +186,26 @@ class PositionEmbeddingRandom(paddle.nn.Layer): Positional encoding using random spatial frequencies. """ - def __init__(self, num_pos_feats: int=64, - scale: Optional[float]=None) -> None: + def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None: super().__init__() if scale is None or scale <= 0.0: scale = 1.0 self.register_buffer( "positional_encoding_gaussian_matrix", - scale * paddle.randn(shape=(2, num_pos_feats)), ) + scale * paddle.randn(shape=(2, num_pos_feats)), + ) def _pe_encoding(self, coords: paddle.Tensor) -> paddle.Tensor: """Positionally encode points that are normalized to [0,1].""" coords = 2 * coords - 1 - coords = coords @self.positional_encoding_gaussian_matrix + coords = coords @ self.positional_encoding_gaussian_matrix coords = 2 * np.pi * coords - return paddle.concat( - x=[paddle.sin(x=coords), paddle.cos(x=coords)], axis=-1) + return paddle.concat(x=[paddle.sin(x=coords), paddle.cos(x=coords)], axis=-1) def forward(self, size: Tuple[int, int]) -> paddle.Tensor: """Generate positional encoding for a grid of the specified size.""" h, w = size - device: Any = self.positional_encoding_gaussian_matrix.place + # device: Any = self.positional_encoding_gaussian_matrix.place grid = paddle.ones(shape=(h, w), dtype="float32") y_embed = grid.cumsum(axis=0) - 0.5 x_embed = grid.cumsum(axis=1) - 0.5 @@ -230,9 +214,7 @@ def forward(self, size: Tuple[int, int]) -> paddle.Tensor: pe = self._pe_encoding(paddle.stack(x=[x_embed, y_embed], axis=-1)) return pe.transpose(perm=[2, 0, 1]) - def forward_with_coords(self, - coords_input: paddle.Tensor, - image_size: Tuple[int, int]) -> paddle.Tensor: + def forward_with_coords(self, coords_input: paddle.Tensor, image_size: Tuple[int, int]) -> paddle.Tensor: """Positionally encode points that are not normalized to [0,1].""" coords = coords_input.clone() coords[:, :, (0)] = coords[:, :, (0)] / 
image_size[1] diff --git a/paddlemix/models/sam/transformer.py b/paddlemix/models/sam/transformer.py index ef27885ce8a21..a040b99aac6f8 100644 --- a/paddlemix/models/sam/transformer.py +++ b/paddlemix/models/sam/transformer.py @@ -15,7 +15,6 @@ import math from typing import Tuple, Type -import paddle import paddle.nn.functional as F from paddle import Tensor, nn @@ -24,13 +23,14 @@ class TwoWayTransformer(nn.Layer): def __init__( - self, - depth: int, - embedding_dim: int, - num_heads: int, - mlp_dim: int, - activation: Type[nn.Layer]=nn.ReLU, - attention_downsample_rate: int=2, ) -> None: + self, + depth: int, + embedding_dim: int, + num_heads: int, + mlp_dim: int, + activation: Type[nn.Layer] = nn.ReLU, + attention_downsample_rate: int = 2, + ) -> None: """ A transformer decoder that attends to an input image using queries whose positional embedding is supplied. @@ -57,13 +57,13 @@ def __init__( mlp_dim=mlp_dim, activation=activation, attention_downsample_rate=attention_downsample_rate, - skip_first_layer_pe=i == 0, )) - self.final_attn_token_to_image = Attention( - embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + skip_first_layer_pe=i == 0, + ) + ) + self.final_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) self.norm_final_attn = nn.LayerNorm(embedding_dim) - def forward(self, image_embedding, image_pe, - point_embedding) -> Tuple[Tensor, Tensor]: + def forward(self, image_embedding, image_pe, point_embedding) -> Tuple[Tensor, Tensor]: """ Args: image_embedding (paddle.Tensor): image to attend to. Should be shape @@ -83,11 +83,7 @@ def forward(self, image_embedding, image_pe, queries = point_embedding keys = image_embedding for layer in self.layers: - queries, keys = layer( - queries=queries, - keys=keys, - query_pe=point_embedding, - key_pe=image_pe) + queries, keys = layer(queries=queries, keys=keys, query_pe=point_embedding, key_pe=image_pe) q = queries + point_embedding k = keys + image_pe attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys) @@ -98,13 +94,14 @@ def forward(self, image_embedding, image_pe, class TwoWayAttentionBlock(nn.Layer): def __init__( - self, - embedding_dim: int, - num_heads: int, - mlp_dim: int=2048, - activation: Type[nn.Layer]=nn.ReLU, - attention_downsample_rate: int=2, - skip_first_layer_pe: bool=False, ) -> None: + self, + embedding_dim: int, + num_heads: int, + mlp_dim: int = 2048, + activation: Type[nn.Layer] = nn.ReLU, + attention_downsample_rate: int = 2, + skip_first_layer_pe: bool = False, + ) -> None: """ A transformer block with four layers: (1) self-attention of sparse inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp @@ -121,14 +118,12 @@ def __init__( super().__init__() self.self_attn = Attention(embedding_dim, num_heads) self.norm1 = nn.LayerNorm(embedding_dim) - self.cross_attn_token_to_image = Attention( - embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + self.cross_attn_token_to_image = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) self.norm2 = nn.LayerNorm(embedding_dim) self.mlp = MLPBlock(embedding_dim, mlp_dim, activation) self.norm3 = nn.LayerNorm(embedding_dim) self.norm4 = nn.LayerNorm(embedding_dim) - self.cross_attn_image_to_token = Attention( - embedding_dim, num_heads, downsample_rate=attention_downsample_rate) + self.cross_attn_image_to_token = Attention(embedding_dim, num_heads, downsample_rate=attention_downsample_rate) self.skip_first_layer_pe = skip_first_layer_pe def 
forward(self, queries, keys, query_pe, key_pe) -> Tuple[Tensor, Tensor]: @@ -161,16 +156,12 @@ class Attention(nn.Layer): after projection to queries, keys, and values. """ - def __init__(self, - embedding_dim: int, - num_heads: int, - downsample_rate: int=1) -> None: + def __init__(self, embedding_dim: int, num_heads: int, downsample_rate: int = 1) -> None: super().__init__() self.embedding_dim = embedding_dim self.internal_dim = embedding_dim // downsample_rate self.num_heads = num_heads - assert (self.internal_dim % num_heads == 0 - ), "num_heads must divide embedding_dim." + assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim." self.q_proj = nn.Linear(embedding_dim, self.internal_dim) self.k_proj = nn.Linear(embedding_dim, self.internal_dim) self.v_proj = nn.Linear(embedding_dim, self.internal_dim) @@ -194,10 +185,10 @@ def forward(self, q, k, v): k = self._separate_heads(k, self.num_heads) v = self._separate_heads(v, self.num_heads) _, _, _, c_per_head = q.shape - attn = q @k.transpose([0, 1, 3, 2]) + attn = q @ k.transpose([0, 1, 3, 2]) attn = attn / math.sqrt(c_per_head) attn = F.softmax(attn, axis=-1) - out = attn @v + out = attn @ v out = self._recombine_heads(out) out = self.out_proj(out) return out diff --git a/paddlemix/models/visualglm/configuration.py b/paddlemix/models/visualglm/configuration.py index a0d326cb0502c..9ec9ae6ce6c5a 100644 --- a/paddlemix/models/visualglm/configuration.py +++ b/paddlemix/models/visualglm/configuration.py @@ -72,22 +72,23 @@ class VisualGLMVisionConfig(PretrainedConfig): model_type = "visualglm_vision_model" def __init__( - self, - hidden_size=1408, - intermediate_size=6144, - num_hidden_layers=39, - num_attention_heads=16, - num_channels=3, - image_size=224, - patch_size=14, - hidden_act="gelu", - layer_norm_eps=0.00001, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - initializer_factor=1.0, - qkv_bias=True, - **kwargs, ): + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__(**kwargs) @@ -107,17 +108,13 @@ def __init__( self.qkv_bias = qkv_bias @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from VisualGLMConfig if config_dict.get("model_type") == "visualglm": config_dict = config_dict["vision_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
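Editor's note: a hedged sketch of how the from_pretrained override above resolves nested configs: when the stored config's model_type is "visualglm", only its "vision_config" section is kept. The override values and the checkpoint path below are placeholders; the import path is inferred from the file path in this diff.

from paddlemix.models.visualglm.configuration import VisualGLMVisionConfig  # assumed import path

# Explicit construction, overriding a couple of fields (all others keep the defaults above).
vision_config = VisualGLMVisionConfig(image_size=224, patch_size=14)

# Loading from a full VisualGLM checkpoint would pull out just the nested vision section:
# vision_config = VisualGLMVisionConfig.from_pretrained("path/to/visualglm-checkpoint")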
@@ -181,23 +178,24 @@ class VisualGLMQFormerConfig(PretrainedConfig): model_type = "visualglm_qformer_model" def __init__( - self, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - position_embedding_type="absolute", - classifier_dropout=None, - cross_attention_frequency=2, - encoder_hidden_size=1408, - **kwargs, ): + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.hidden_size = hidden_size @@ -216,18 +214,14 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, - **kwargs) + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the qformer config dict if we are loading from VisualGLMConfig if config_dict.get("model_type") == "visualglm": config_dict = config_dict["qformer_config"] - if ("model_type" in config_dict and hasattr(cls, "model_type") and - config_dict["model_type"] != cls.model_type): + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warning( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." @@ -280,42 +274,34 @@ class VisualGLMConfig(PretrainedConfig): model_type = "visualglm" def __init__( - self, - vision_config=None, - qformer_config=None, - text_config=None, - num_query_tokens=32, - **kwargs, ): + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + **kwargs, + ): super().__init__(**kwargs) if vision_config is None: vision_config = {} - logger.info( - "vision_config is None. initializing the VisualGLMVisionConfig with default values." - ) + logger.info("vision_config is None. initializing the VisualGLMVisionConfig with default values.") if qformer_config is None: qformer_config = {} - logger.info( - "qformer_config is None. Initializing the VisualGLMQFormerConfig with default values." - ) + logger.info("qformer_config is None. Initializing the VisualGLMQFormerConfig with default values.") if text_config is None: text_config = {} - logger.info( - "text_config is None. Initializing the text config with default values (`ChatGLMConfig`)." - ) + logger.info("text_config is None. 
Initializing the text config with default values (`ChatGLMConfig`).") self.vision_config = VisualGLMVisionConfig(**vision_config) self.qformer_config = VisualGLMQFormerConfig(**qformer_config) - text_model_type = (text_config["model_type"] - if "model_type" in text_config else "chatglm") + text_model_type = text_config["model_type"] if "model_type" in text_config else "chatglm" if text_model_type == "chatglm": self.text_config = ChatGLMConfig(**text_config) else: - raise ValueError( - "Only chatglm accepted for model_type, but accepted {}.".format( - text_model_type)) + raise ValueError("Only chatglm accepted for model_type, but accepted {}.".format(text_model_type)) self.num_query_tokens = num_query_tokens self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size @@ -325,11 +311,12 @@ def __init__( @classmethod def from_vision_qformer_text_configs( - cls, - vision_config: VisualGLMVisionConfig, - qformer_config: VisualGLMQFormerConfig, - text_config: PretrainedConfig, - **kwargs, ): + cls, + vision_config: VisualGLMVisionConfig, + qformer_config: VisualGLMQFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): r""" Instantiate a [`VisualGLMConfig`] (or a derived class) from a vision model, Q-Former and language model configurations. @@ -341,7 +328,8 @@ def from_vision_qformer_text_configs( vision_config=vision_config.to_dict(), qformer_config=qformer_config.to_dict(), text_config=text_config.to_dict(), - **kwargs, ) + **kwargs, + ) def to_dict(self): """ diff --git a/paddlemix/models/visualglm/modeling.py b/paddlemix/models/visualglm/modeling.py index 6358f9cd8ca20..f478f79caf776 100644 --- a/paddlemix/models/visualglm/modeling.py +++ b/paddlemix/models/visualglm/modeling.py @@ -24,18 +24,27 @@ from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig from paddlenlp.transformers.chatglm.modeling import ChatGLMForCausalLM from paddlenlp.transformers.model_outputs import ( - BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, - BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions, - ModelOutput) + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) from paddlenlp.transformers.model_utils import ( - PretrainedModel, apply_chunking_to_forward, - find_pruneable_heads_and_indices, prune_linear_layer) + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) from ...activations import ACT2FN from ...utils.initializer import normal_, ones_, zeros_ from ...utils.log import logger -from .configuration import (VisualGLMConfig, VisualGLMQFormerConfig, - VisualGLMVisionConfig) +from .configuration import ( + VisualGLMConfig, + VisualGLMQFormerConfig, + VisualGLMVisionConfig, +) VisualGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [] @@ -53,7 +62,8 @@ def Parameter(tensor, dtype="float16"): return paddle.create_parameter( tensor.shape, dtype=tensor.dtype, - default_initializer=nn.initializer.Assign(tensor), ) + default_initializer=nn.initializer.Assign(tensor), + ) @dataclass @@ -81,9 +91,11 @@ class VisualGLMForConditionalGenerationModelOutput(ModelOutput): def to_tuple(self) -> Tuple[Any]: return tuple( - self[k] if k not in - ["vision_outputs", "qformer_outputs", "language_model_outputs"] else - getattr(self, k).to_tuple() for k in self.keys()) + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) 
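# Editor's note (illustrative aside, not part of this diff): a sketch of assembling a full
# VisualGLMConfig from its three sub-configs via from_vision_qformer_text_configs, shown in
# the configuration.py hunks above. Default sub-configs are assumed to be mutually
# compatible; import paths are inferred from the file paths in this diff.
from paddlenlp.transformers.chatglm.configuration import ChatGLMConfig
from paddlemix.models.visualglm.configuration import (
    VisualGLMConfig,
    VisualGLMQFormerConfig,
    VisualGLMVisionConfig,
)

config = VisualGLMConfig.from_vision_qformer_text_configs(
    vision_config=VisualGLMVisionConfig(),
    qformer_config=VisualGLMQFormerConfig(),
    text_config=ChatGLMConfig(),
)
# VisualGLMConfig.__init__ then ties qformer_config.encoder_hidden_size to vision_config.hidden_size (1408).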
class VisualGLMPretrainedModel(PretrainedModel): @@ -95,13 +107,14 @@ class VisualGLMPretrainedModel(PretrainedModel): config_class = VisualGLMConfig base_model_prefix = "visualglm" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids", ] + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_range - if (isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or - isinstance(module, nn.Linear)): + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): normal_(module.weight, mean=0.0, std=factor) if hasattr(module, "bias") and module.bias is not None: zeros_(module.bias) @@ -111,7 +124,9 @@ def _init_weights(self, module): factor = self.config.vision_config.initializer_range trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) trunc_normal_(module.position_embedding) - trunc_normal_(module.class_embedding, ) + trunc_normal_( + module.class_embedding, + ) elif isinstance(module, nn.LayerNorm): zeros_(module.bias) ones_(module.weight) @@ -136,30 +151,30 @@ def __init__(self, config: VisualGLMVisionConfig): in_channels=self.in_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, - stride=self.patch_size, ) + stride=self.patch_size, + ) - self.num_patches = (self.image_size // self.patch_size)**2 + self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.class_embedding = Parameter( paddle.randn([1, 1, self.embed_dim]), - dtype=self.patch_embedding.weight.dtype, ) + dtype=self.patch_embedding.weight.dtype, + ) self.position_embedding = Parameter( paddle.randn([1, self.num_positions, self.embed_dim]), - dtype=self.patch_embedding.weight.dtype, ) + dtype=self.patch_embedding.weight.dtype, + ) def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding( - pixel_values) # shape = [*, width, grid, grid] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) - class_embeds = self.class_embedding.expand( - [batch_size, 1, -1]).cast(target_dtype) + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) - embeddings = embeddings + self.position_embedding[:, :embeddings.shape[ - 1], :].cast(target_dtype) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) return embeddings @@ -175,60 +190,52 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads}).") + f" {self.num_heads})." 
+ ) self.scale = self.head_dim**-0.5 self.dropout = nn.Dropout(config.attention_dropout) # small tweak here compared to CLIP, no bias here - self.qkv = nn.Linear( - self.embed_dim, 3 * self.embed_dim, bias_attr=False) + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) if config.qkv_bias: - q_bias = Parameter( - paddle.zeros( - [self.embed_dim], dtype=self.qkv.weight.dtype)) - v_bias = Parameter( - paddle.zeros( - [self.embed_dim], dtype=self.qkv.weight.dtype)) + q_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + v_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) else: q_bias = None v_bias = None if q_bias is not None: - qkv_bias = paddle.concat( - (q_bias, paddle.zeros_like(v_bias), v_bias)) + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) self.qkv.bias = Parameter(qkv_bias, dtype=self.qkv.weight.dtype) self.projection = nn.Linear(self.embed_dim, self.embed_dim) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def forward( - self, - hidden_states: paddle.Tensor, - head_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[ - paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[ - paddle.Tensor]]]: + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.shape mixed_qkv = self.qkv(hidden_states) - mixed_qkv = mixed_qkv.reshape( - [bsz, tgt_len, 3, self.num_heads, - embed_dim // self.num_heads]).transpose([2, 0, 3, 1, 4]) + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) query_states, key_states, value_states = ( mixed_qkv[0], mixed_qkv[1], - mixed_qkv[2], ) + mixed_qkv[2], + ) # Take the dot product between "query" and "key" to get the raw attention scores. 
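# Editor's note (illustrative aside, not part of this diff): the lines that follow compute
# scaled dot-product attention scores, i.e. scores = (q @ k^T) * head_dim ** -0.5, using the
# self.scale defined above. Standalone shape sketch under assumed sizes
# (batch=2, heads=12, seq_len=16, head_dim=64):
import paddle
q = paddle.randn([2, 12, 16, 64])
k = paddle.randn([2, 12, 16, 64])
scores = paddle.matmul(q, k, transpose_y=True) * 64**-0.5  # -> [2, 12, 16, 16]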
- attention_scores = paddle.matmul( - query_states, key_states, transpose_y=True) + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) attention_scores = attention_scores * self.scale @@ -243,16 +250,16 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = paddle.matmul(attention_probs, value_states).transpose( - [0, 2, 1, 3]) + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) - new_context_layer_shape = context_layer.shape[:-2] + [self.embed_dim, ] + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] context_layer = context_layer.reshape(new_context_layer_shape) output = self.projection(context_layer) - outputs = (output, attention_probs) if output_attentions else (output, - None) + outputs = (output, attention_probs) if output_attentions else (output, None) return outputs @@ -277,17 +284,16 @@ def __init__(self, config: VisualGLMConfig): super().__init__() self.embed_dim = config.hidden_size self.self_attn = VisualGLMAttention(config) - self.layer_norm1 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) self.mlp = VisualGLMMLP(config) - self.layer_norm2 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: """ Args: hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -304,7 +310,8 @@ def forward( hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, head_mask=attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = hidden_states + residual residual = hidden_states hidden_states = self.layer_norm2(hidden_states) @@ -312,10 +319,10 @@ def forward( hidden_states = hidden_states + residual - outputs = (hidden_states, ) + outputs = (hidden_states,) if output_attentions: - outputs += (attn_weights, ) + outputs += (attn_weights,) return outputs @@ -332,20 +339,17 @@ class VisualGLMEncoder(nn.Layer): def __init__(self, config: VisualGLMConfig): super().__init__() self.config = config - self.layers = nn.LayerList([ - VisualGLMEncoderLayer(config) - for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.LayerList([VisualGLMEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - inputs_embeds, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - BaseModelOutput]: + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: r""" Args: inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -367,13 +371,11 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -381,7 +383,7 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): @@ -393,29 +395,30 @@ def custom_forward(*inputs): layer_outputs = recompute( create_custom_forward(encoder_layer), hidden_states, - attention_mask, ) + attention_mask, + ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) + all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, - attentions=all_attentions, ) + attentions=all_attentions, + ) class VisualGLMVisionModel(VisualGLMPretrainedModel): @@ -429,26 +432,23 @@ def __init__(self, config: VisualGLMVisionConfig): self.embeddings = VisualGLMVisionEmbeddings(config) self.encoder = VisualGLMEncoder(config) - self.post_layernorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -459,7 +459,8 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.post_layernorm(last_hidden_state) @@ -474,7 +475,8 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) def get_input_embeddings(self): return self.embeddings @@ -484,35 +486,29 @@ class VisualGLMQFormerMultiHeadAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() self.config = config - if config.hidden_size % config.num_attention_heads != 0 and not hasattr( - config, "embedding_size"): + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention heads (%d)" - % (config.hidden_size, config.num_attention_heads)) + % (config.hidden_size, config.num_attention_heads) + ) self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / - config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = nn.Linear(config.hidden_size, self.all_head_size) if is_cross_attention: self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) - self.value = nn.Linear(config.encoder_hidden_size, - self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) else: self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr( - config, "position_embedding_type", "absolute") - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding( - 2 * config.max_position_embeddings - 1, - self.attention_head_size) + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -536,30 +532,28 @@ def transpose_for_scores(self, x): return x.transpose([0, 2, 1, 3]) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): # If this is instantiated as a cross-attention module, the keys # and values come from an encoder; the attention mask needs to be # such that the encoder's padding tokens are not attended to. 
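# Editor's note (illustrative aside, not part of this diff): in the cached-decoding branch
# below, past keys/values of shape [batch, num_heads, past_len, head_dim] are concatenated
# with the new step along axis=2, the sequence axis. Shape sketch under assumed sizes:
import paddle
past_k = paddle.randn([1, 12, 10, 64])      # 10 cached positions
new_k = paddle.randn([1, 12, 1, 64])        # 1 new position
k = paddle.concat([past_k, new_k], axis=2)  # -> [1, 12, 11, 64]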
is_cross_attention = encoder_hidden_states is not None if is_cross_attention: - key_layer = self.transpose_for_scores( - self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores( - self.value(encoder_hidden_states)) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) attention_mask = encoder_attention_mask elif past_key_value is not None: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) - value_layer = paddle.concat( - [past_key_value[1], value_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) else: key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) @@ -571,37 +565,25 @@ def forward( past_key_value = (key_layer, value_layer) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = paddle.matmul( - query_layer, key_layer, transpose_y=True) + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) - if (self.position_embedding_type == "relative_key" or - self.position_embedding_type == "relative_key_query"): + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": seq_length = hidden_states.shape[1] - position_ids_l = paddle.arange( - seq_length, dtype="int64").reshape([-1, 1]) - position_ids_r = paddle.arange( - seq_length, dtype="int64").reshape([1, -1]) + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding( - distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.cast( - dtype=query_layer.dtype) # fp16 compatibility + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility if self.position_embedding_type == "relative_key": - relative_position_scores = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) attention_scores = attention_scores + relative_position_scores elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = paddle.einsum( - "bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = paddle.einsum( - "bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = ( - attention_scores + relative_position_scores_query + - relative_position_scores_key) + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt( - self.attention_head_size) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BertModel forward() function) @@ -630,10 +612,9 @@ def forward( ] 
context_layer = context_layer.reshape(new_context_layer_shape) - outputs = ((context_layer, attention_probs) - if output_attentions else (context_layer, )) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - outputs = outputs + (past_key_value, ) + outputs = outputs + (past_key_value,) return outputs @@ -641,12 +622,10 @@ class VisualGLMQFormerSelfOutput(nn.Layer): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) @@ -656,8 +635,7 @@ def forward(self, hidden_states: paddle.Tensor, class VisualGLMQFormerAttention(nn.Layer): def __init__(self, config, is_cross_attention=False): super().__init__() - self.attention = VisualGLMQFormerMultiHeadAttention(config, - is_cross_attention) + self.attention = VisualGLMQFormerMultiHeadAttention(config, is_cross_attention) self.output = VisualGLMQFormerSelfOutput(config) self.pruned_heads = set() @@ -668,7 +646,8 @@ def prune_heads(self, heads): heads, self.attention.num_attention_heads, self.attention.attention_head_size, - self.pruned_heads, ) + self.pruned_heads, + ) # Prune linear layers self.attention.query = prune_linear_layer(self.attention.query, index) @@ -677,21 +656,20 @@ def prune_heads(self, heads): self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) # Update hyper params and store pruned heads - self.attention.num_attention_heads = self.attention.num_attention_heads - len( - heads) - self.attention.all_head_size = (self.attention.attention_head_size * - self.attention.num_attention_heads) + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - head_mask: Optional[paddle.Tensor]=None, - encoder_hidden_states: Optional[paddle.Tensor]=None, - encoder_attention_mask: Optional[paddle.Tensor]=None, - past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: self_outputs = self.attention( hidden_states, attention_mask, @@ -699,10 +677,10 @@ def forward( encoder_hidden_states, encoder_attention_mask, past_key_value, - output_attentions, ) + output_attentions, + ) attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output, - ) + self_outputs[1:] # add attentions if we output them + outputs = 
(attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -728,8 +706,7 @@ def __init__(self, config): # self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - def forward(self, hidden_states: paddle.Tensor, - input_tensor: paddle.Tensor) -> paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = hidden_states + input_tensor @@ -742,15 +719,13 @@ def __init__(self, config, layer_idx): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.input_layernorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.attention = VisualGLMQFormerAttention(config) self.layer_idx = layer_idx if layer_idx % config.cross_attention_frequency == 0: - self.crossattention = VisualGLMQFormerAttention( - config, is_cross_attention=True) + self.crossattention = VisualGLMQFormerAttention(config, is_cross_attention=True) self.has_cross_attention = True else: self.has_cross_attention = False @@ -759,25 +734,26 @@ def __init__(self, config, layer_idx): self.output_query = VisualGLMQFormerOutput(config) def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = (past_key_value[:2] - if past_key_value is not None else None) + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None hidden_states = self.input_layernorm(hidden_states) self_attention_outputs = self.attention( hidden_states, # 1, 32, 768 attention_mask, head_mask, output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, ) + past_key_value=self_attn_past_key_value, + ) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:-1] @@ -788,16 +764,15 @@ def forward( if self.has_cross_attention: if encoder_hidden_states is None: - raise ValueError( - "encoder_hidden_states must be given for cross-attention layers" - ) + raise ValueError("encoder_hidden_states must be given for cross-attention layers") cross_attention_outputs = self.crossattention( query_attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) query_attention_output = cross_attention_outputs[0] # add cross attentions if we output attention weights outputs = outputs + cross_attention_outputs[1:-1] @@ -806,25 +781,27 @@ def forward( self.feed_forward_chunk_query, self.chunk_size_feed_forward, self.seq_len_dim, - query_attention_output, ) + query_attention_output, + ) if attention_output.shape[1] > query_length: layer_output_text = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output[:, query_length:, :], ) - layer_output = paddle.concat( - [layer_output, 
layer_output_text], axis=1) + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) else: layer_output = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, - attention_output, ) - outputs = (layer_output, ) + outputs + attention_output, + ) + outputs = (layer_output,) + outputs - outputs = outputs + (present_key_value, ) + outputs = outputs + (present_key_value,) return outputs @@ -843,25 +820,25 @@ class VisualGLMQFormerEncoder(nn.Layer): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.LayerList([ - VisualGLMQFormerLayer(config, layer_idx) - for layer_idx in range(config.num_hidden_layers) - ]) + self.layer = nn.LayerList( + [VisualGLMQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) self.gradient_checkpointing = False def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - query_length=0, ): + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions else None @@ -871,14 +848,12 @@ def forward( for i in range(self.config.num_hidden_layers): layer_module = self.layer[i] if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[ - i] if past_key_values is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None - if getattr(self.config, "gradient_checkpointing", - False) and self.training: + if getattr(self.config, "gradient_checkpointing", False) and self.training: if use_cache: logger.warn( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -887,8 +862,7 @@ def forward( def create_custom_forward(module): def custom_forward(*inputs): - return module(*inputs, past_key_value, - output_attentions, query_length) + return module(*inputs, past_key_value, output_attentions, query_length) return custom_forward @@ -898,7 +872,8 @@ def custom_forward(*inputs): attention_mask, layer_head_mask, encoder_hidden_states, - encoder_attention_mask, ) + encoder_attention_mask, + ) else: layer_outputs = layer_module( hidden_states, @@ -908,35 +883,39 @@ def custom_forward(*inputs): encoder_attention_mask, past_key_value, output_attentions, - query_length, ) + query_length, + ) hidden_states = layer_outputs[0] if use_cache: - next_decoder_cache += (layer_outputs[-1], ) + next_decoder_cache += (layer_outputs[-1],) if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1], ) + all_self_attentions = all_self_attentions + (layer_outputs[1],) if layer_module.has_cross_attention: - all_cross_attentions = all_cross_attentions + ( - layer_outputs[2], ) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states, ) + all_hidden_states = all_hidden_states + (hidden_states,) if not return_dict: - return tuple(v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] if v is not None) + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, - cross_attentions=all_cross_attentions, ) + cross_attentions=all_cross_attentions, + ) class VisualGLMQFormerModel(VisualGLMPretrainedModel): @@ -948,8 +927,7 @@ def __init__(self, config: VisualGLMQFormerConfig): super().__init__(config) self.config = config - self.final_layernorm = nn.LayerNorm( - config.hidden_size, epsilon=config.layer_norm_eps) + self.final_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.encoder = VisualGLMQFormerEncoder(config) @@ -969,10 +947,11 @@ class PreTrainedModel self.encoder.layer[layer].attention.prune_heads(heads) def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - has_query: bool=False, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: @@ -993,21 +972,21 @@ def get_extended_attention_mask( extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
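# Editor's note (illustrative aside, not part of this diff): a numeric example of the
# additive mask built just below. A keep/ignore mask of [1, 1, 0] becomes [0, 0, -10000],
# which pushes the softmax weight of the masked position towards zero.
import paddle
attention_mask = paddle.to_tensor([1.0, 1.0, 0.0])
additive = (1.0 - attention_mask) * -10000.0  # -> [0., 0., -10000.]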
- extended_attention_mask = extended_attention_mask.cast( - dtype=self.config.dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.cast(dtype=self.config.dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 return extended_attention_mask - def invert_attention_mask( - self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: """ Invert an attention mask (e.g., switches 0. and 1.). Args: @@ -1016,28 +995,27 @@ def invert_attention_mask( `paddle.Tensor`: The inverted attention mask. """ if encoder_attention_mask.ndim == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, - None, :, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.ndim == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, - None, :] + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow # /transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == # encoder_extended_attention_mask.transpose(-1, -2)) encoder_extended_attention_mask = encoder_extended_attention_mask.cast( - dtype=self.config.dtype) # fp16 compatibility - encoder_extended_attention_mask = ( - 1.0 - encoder_extended_attention_mask) * -1e4 + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 return encoder_extended_attention_mask def get_head_mask( - self, - head_mask: Optional[paddle.Tensor], - num_hidden_layers: int, - is_attention_chunked: bool=False, ) -> paddle.Tensor: + self, + head_mask: Optional[paddle.Tensor], + num_hidden_layers: int, + is_attention_chunked: bool = False, + ) -> paddle.Tensor: """ Prepare the head mask if needed. Args: @@ -1052,8 +1030,7 @@ def get_head_mask( `[None]` for each layer. """ if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, - num_hidden_layers) + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) if is_attention_chunked is True: head_mask = head_mask.unsqueeze(-1) else: @@ -1064,30 +1041,27 @@ def get_head_mask( def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" if head_mask.ndim == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze( - -1).unsqueeze(-1) + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) elif head_mask.ndim == 2: - head_mask = (head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) - ) # We can specify head_mask for each layer + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.cast( - dtype=self.config. 
- dtype) # switch to float if need + fp16 compatibility + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility return head_mask def forward( - self, - query_embeds, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): r""" encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1107,18 +1081,16 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # past_key_values_length past_key_values_length = ( - past_key_values[0][0].shape[2] - self.config.query_length - if past_key_values is not None else 0) + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) query_length = query_embeds.shape[1] if query_embeds is not None else 0 @@ -1128,39 +1100,32 @@ def forward( batch_size, seq_length = input_shape if attention_mask is None: - attention_mask = paddle.ones(( - (batch_size, seq_length + past_key_values_length))) + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
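# Editor's note (illustrative aside, not part of this diff): get_extended_attention_mask,
# defined earlier in this file, broadcasts a 2D padding mask [batch, seq_len] to
# [batch, 1, 1, seq_len], and a 3D [batch, from_len, to_len] mask to
# [batch, 1, from_len, to_len]. Shape sketch under assumed sizes:
import paddle
mask_2d = paddle.ones([4, 32])
print(mask_2d[:, None, None, :].shape)  # [4, 1, 1, 32]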
- extended_attention_mask = self.get_extended_attention_mask( - attention_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] if encoder_hidden_states is not None: if type(encoder_hidden_states) == list: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[ - 0].shape + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape else: ( encoder_batch_size, encoder_sequence_length, - _, ) = encoder_hidden_states.shape + _, + ) = encoder_hidden_states.shape encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if type(encoder_attention_mask) == list: - encoder_extended_attention_mask = [ - self.invert_attention_mask(mask) - for mask in encoder_attention_mask - ] + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] elif encoder_attention_mask is None: encoder_attention_mask = paddle.ones(encoder_hidden_shape) - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: - encoder_extended_attention_mask = self.invert_attention_mask( - encoder_attention_mask) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) else: encoder_extended_attention_mask = None @@ -1182,7 +1147,8 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, - query_length=query_length, ) + query_length=query_length, + ) sequence_output = encoder_outputs[0] sequence_output = self.final_layernorm(sequence_output) pooled_output = sequence_output[:, 0, :] @@ -1196,7 +1162,8 @@ def forward( past_key_values=encoder_outputs.past_key_values, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + cross_attentions=encoder_outputs.cross_attentions, + ) class VisualGLMModel(VisualGLMPretrainedModel): @@ -1208,27 +1175,26 @@ def __init__(self, config: VisualGLMConfig): self.vision_model = VisualGLMVisionModel(config.vision_config) self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ]), - dtype=self.config.dtype, ) + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), + dtype=self.config.dtype, + ) self.qformer = VisualGLMQFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) self.language_model = ChatGLMForCausalLM(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def get_text_features( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: 
text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): @@ -1246,30 +1212,30 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) >>> text_features = model.get_text_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.language_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return text_outputs def get_image_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1289,32 +1255,30 @@ def get_image_features( >>> inputs = processor.process_images(images=image, return_tensors="pd") >>> image_outputs = model.get_image_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) - - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return vision_outputs def get_qformer_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, - **kwargs, ): + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ): r""" Returns: vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): @@ -1334,56 +1298,51 @@ def get_qformer_features( >>> inputs = 
processor.process_images(images=image, return_tensors="pd") >>> qformer_outputs = model.get_qformer_features(**inputs) ```""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model( pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) image_embeds = vision_outputs[0] - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, ) + return_dict=True, + ) return query_outputs def forward( - self, - pixel_values: paddle.Tensor, # processed image - first_input_ids: paddle.Tensor, - second_input_ids: paddle.Tensor, - first_attention_mask: Optional[paddle.Tensor]=None, - second_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - labels: Optional[paddle.Tensor]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, VisualGLMForConditionalGenerationModelOutput]: + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, VisualGLMForConditionalGenerationModelOutput]: r""" Returns: Examples: @@ -1401,68 +1360,60 @@ def forward( >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") >>> outputs = model(**inputs) ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict # step 1: forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: use the language model, conditioned on the text and image language_model_inputs = self.language_projection(query_output) - language_model_attention_mask = paddle.ones( - language_model_inputs.shape[:-1], dtype="int64") - - first_embeds = self.language_model.chatglm.transformer.word_embeddings( - first_input_ids) - second_embeds = self.language_model.chatglm.word_embeddings( - second_input_ids) - language_model_inputs = paddle.cast( - language_model_inputs, dtype=first_embeds.dtype) - inputs_embeds = paddle.concat( - [first_embeds, language_model_inputs, second_embeds], axis=1) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.chatglm.transformer.word_embeddings(first_input_ids) + second_embeds = self.language_model.chatglm.word_embeddings(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) if first_attention_mask is None: - first_attention_mask = paddle.ones_like( - first_embeds.shape[:-1], dtype="int64") + first_attention_mask = paddle.ones_like(first_embeds.shape[:-1], dtype="int64") if second_attention_mask is None: - second_attention_mask = paddle.ones_like( - second_embeds.shape[:-1], dtype="int64") + second_attention_mask = paddle.ones_like(second_embeds.shape[:-1], dtype="int64") attention_mask = paddle.concat( [ first_attention_mask, language_model_attention_mask, second_attention_mask, ], - axis=1, ) + axis=1, + ) outputs = self.language_model( inputs_embeds=inputs_embeds, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) logits = outputs.logits if return_dict else outputs[0] loss = None # we compute the loss here since we need to take into account the sequence length of the query embeds if labels is not None: - logits = logits[:, -labels.shape[1]:, :] + logits = logits[:, -labels.shape[1] :, :] # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :] shift_labels = labels[..., 1:] @@ -1472,18 +1423,20 @@ def forward( loss = loss_fct( shift_logits.reshape([-1, self.config.text_config.vocab_size]), - shift_labels.reshape([-1]), ) + 
shift_labels.reshape([-1]), + ) if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) - return ((loss, ) + output) if loss is not None else output + return ((loss,) + output) if loss is not None else output return VisualGLMForConditionalGenerationModelOutput( loss=loss, logits=logits, vision_outputs=vision_outputs, qformer_outputs=query_outputs, - language_model_outputs=outputs, ) + language_model_outputs=outputs, + ) class ChatGLMForConditionalGenerationWithImage(ChatGLMForCausalLM): @@ -1492,27 +1445,25 @@ def __init__(self, config: ChatGLMConfig): self.config = config def forward( - self, - image_features: paddle.Tensor, - input_ids: paddle.Tensor, - position_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - pre_image_length: Optional[int]=None, - cache: Optional[Tuple[paddle.Tensor]]=None, - inputs_embeds: Optional[paddle.Tensor]=None, - labels: Optional[paddle.Tensor]=None, - use_cache: Optional[bool]=None, - return_dict: Optional[bool]=None, ): - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + self, + image_features: paddle.Tensor, + input_ids: paddle.Tensor, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + pre_image_length: Optional[int] = None, + cache: Optional[Tuple[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if inputs_embeds is None and cache is None and image_features is not None: - pre_ids, pad_ids, post_ids = paddle.split( - input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1) + pre_ids, pad_ids, post_ids = paddle.split(input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1) pre_txt_emb = self.chatglm.transformer.word_embeddings(pre_ids) post_txt_emb = self.chatglm.transformer.word_embeddings(post_ids) - inputs_embeds = paddle.concat( - [pre_txt_emb, image_features, post_txt_emb], axis=1) + inputs_embeds = paddle.concat([pre_txt_emb, image_features, post_txt_emb], axis=1) outputs = super().forward( input_ids=input_ids, @@ -1522,7 +1473,8 @@ def forward( inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, - return_dict=return_dict, ) + return_dict=return_dict, + ) return outputs @@ -1536,44 +1488,37 @@ def __init__(self, config: VisualGLMConfig): self.config = config self.vision_model = VisualGLMVisionModel(config.vision_config) self.query_tokens = Parameter( - paddle.zeros([ - 1, config.num_query_tokens, config.qformer_config.hidden_size - ]), - dtype=self.config.dtype, ) + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), + dtype=self.config.dtype, + ) self.qformer = VisualGLMQFormerModel(config.qformer_config) - self.language_projection = nn.Linear(config.qformer_config.hidden_size, - config.text_config.hidden_size) - self.language_model = ChatGLMForConditionalGenerationWithImage( - config.text_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForConditionalGenerationWithImage(config.text_config) def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def encode_images( - self, - pixel_values: paddle.Tensor, # processed image + self, + pixel_values: paddle.Tensor, # processed image ): # step 1: 
forward the images through the vision encoder, # to get image embeddings of shape (batch_size, seq_len, hidden_size) - pixel_values = paddle.cast( - pixel_values, - self.vision_model.embeddings.patch_embedding.weight.dtype) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) vision_outputs = self.vision_model(pixel_values, return_dict=True) image_embeds = vision_outputs.last_hidden_state - image_attention_mask = paddle.ones( - image_embeds.shape[:-1], dtype="int64") + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) - query_tokens = paddle.cast(query_tokens, - self.qformer.final_layernorm.weight.dtype) - image_embeds = paddle.cast(image_embeds, - self.qformer.final_layernorm.weight.dtype) + query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, encoder_attention_mask=image_attention_mask, - return_dict=True, ) + return_dict=True, + ) query_output = query_outputs.last_hidden_state # step 3: mapping query_output into language_model space @@ -1583,12 +1528,13 @@ def encode_images( @paddle.no_grad() def generate( - self, - pixel_values: paddle.Tensor, - input_ids: paddle.Tensor, - pre_image_length: int, - attention_mask: Optional[paddle.Tensor]=None, - **generate_kwargs, ) -> paddle.Tensor: + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + pre_image_length: int, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: """ Overrides `generate` function to be able to use the model as a conditional generator. 
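Note (editorial aside, not part of the diff): during conditional generation the projected Q-Former output is spliced into the ChatGLM prompt at the embedding level, with the layout `[prefix tokens | 32 image slots | suffix tokens]`. A condensed sketch of that splice, reusing the names from the `ChatGLMForConditionalGenerationWithImage.forward` hunk above (the free-standing helper itself is illustrative):

```python
import paddle

def splice_image_features(input_ids, image_features, word_embeddings, pre_image_length):
    # input_ids layout: [prefix tokens | 32 image placeholders | suffix tokens]
    pre_ids, _, post_ids = paddle.split(
        input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1
    )
    pre_emb = word_embeddings(pre_ids)    # text embeddings before the image
    post_emb = word_embeddings(post_ids)  # text embeddings after the image
    # the 32 placeholder positions are replaced by the projected Q-Former features
    return paddle.concat([pre_emb, image_features, post_emb], axis=1)
```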
Args: @@ -1625,6 +1571,7 @@ def generate( image_features=image_features, pre_image_length=pre_image_length, attention_mask=attention_mask, - **generate_kwargs, ) + **generate_kwargs, + ) return outputs diff --git a/paddlemix/optimization.py b/paddlemix/optimization.py index c11363d24087c..70d5400739722 100644 --- a/paddlemix/optimization.py +++ b/paddlemix/optimization.py @@ -42,15 +42,16 @@ class CosineDecayWithWarmup(LRScheduler): """ def __init__( - self, - learning_rate, - epochs, - eta_min=0.0, - warmup_steps=0, - warmup_start_lr=0.0, - last_epoch=-1, - step_each_epoch=1, - **kwargs, ): + self, + learning_rate, + epochs, + eta_min=0.0, + warmup_steps=0, + warmup_start_lr=0.0, + last_epoch=-1, + step_each_epoch=1, + **kwargs, + ): self.start_lr = learning_rate self.T_max = epochs self.eta_min = eta_min @@ -70,12 +71,13 @@ def step(self): cur_step_in_epoch = (self.cur_step - 2) % self.step_each_epoch cur_epoch = (self.cur_step - 2) // self.step_each_epoch if self.cur_step < self.warmup_steps and cur_epoch == 0: - self.last_lr = self.warmup_start_lr + ( - self.start_lr - self.warmup_start_lr) * cur_step_in_epoch / max( - self.warmup_steps, 1) + self.last_lr = self.warmup_start_lr + (self.start_lr - self.warmup_start_lr) * cur_step_in_epoch / max( + self.warmup_steps, 1 + ) else: self.last_lr = (self.start_lr - self.eta_min) * 0.5 * ( - 1.0 + math.cos(math.pi * cur_epoch / self.T_max)) + self.eta_min + 1.0 + math.cos(math.pi * cur_epoch / self.T_max) + ) + self.eta_min self.last_epoch = cur_epoch def get_lr(self): @@ -164,11 +166,8 @@ def get_parameters(args, model, assigner, tower): skip = set() if tower == "visual": lr = args.visual_lr if args.visual_lr is not None else args.learning_rate - weight_decay = (args.visual_wd - if args.visual_wd is not None else args.weight_decay) - filter_parameters = [[name, param] - for name, param in model.named_parameters() - if "visual." in name] + weight_decay = args.visual_wd if args.visual_wd is not None else args.weight_decay + filter_parameters = [[name, param] for name, param in model.named_parameters() if "visual." in name] if hasattr(model, "visual"): if hasattr(model.visual, "no_weight_decay"): skip = set.union(skip, model.visual.no_weight_decay()) @@ -176,9 +175,7 @@ def get_parameters(args, model, assigner, tower): elif tower == "text": lr = args.text_lr if args.text_lr is not None else args.learning_rate weight_decay = args.text_wd if args.text_wd is not None else args.weight_decay - filter_parameters = [[name, param] - for name, param in model.named_parameters() - if "text." in name] + filter_parameters = [[name, param] for name, param in model.named_parameters() if "text." in name] if hasattr(model, "text"): if hasattr(model.text, "no_weight_decay"): skip = set.union(skip, model.text.no_weight_decay()) @@ -187,8 +184,7 @@ def get_parameters(args, model, assigner, tower): lr = args.learning_rate weight_decay = args.weight_decay exclude = lambda n: "visual." not in n and "text." 
not in n - filter_parameters = [[n, p] for n, p in model.named_parameters() - if exclude(n)] + filter_parameters = [[n, p] for n, p in model.named_parameters() if exclude(n)] if hasattr(model, "no_weight_decay"): skip = set.union(skip, model.no_weight_decay()) get_num_layer = assigner.get_layer_id if assigner is not None else None @@ -236,11 +232,8 @@ def get_parameters(args, model, assigner, tower): if is_master(args): logging.info(f"Tower = {tower}") logging.info(f"Skip weight decay name marked in tower-{tower}: {skip}") - logging.info( - f"Num of parameters group in tower-{tower}: {len(parameter_group_vars.values())}" - ) - logging.info( - f"Param groups = {json.dumps(parameter_group_names, indent=2)}") + logging.info(f"Num of parameters group in tower-{tower}: {len(parameter_group_vars.values())}") + logging.info(f"Param groups = {json.dumps(parameter_group_names, indent=2)}") return list(parameter_group_vars.values()) @@ -250,20 +243,19 @@ def get_assigner(args, model): if visual_ld < 1.0: visual_num_layers = model.visual.get_num_layers() assigner_visual = LayerDecayValueAssigner( - list(visual_ld**(visual_num_layers + 1 - i) - for i in range(visual_num_layers + 2))) + list(visual_ld ** (visual_num_layers + 1 - i) for i in range(visual_num_layers + 2)) + ) else: assigner_visual = None if text_ld < 1.0 and hasattr(model, "text"): text_num_layers = model.text.get_num_layers() assigner_text = LayerDecayValueAssigner( - list(text_ld**(text_num_layers + 1 - i) - for i in range(text_num_layers + 2))) + list(text_ld ** (text_num_layers + 1 - i) for i in range(text_num_layers + 2)) + ) else: assigner_text = None if assigner_visual is not None: - logging.info("Assigned visual values = %s" % - str(assigner_visual.values)) + logging.info("Assigned visual values = %s" % str(assigner_visual.values)) if assigner_text is not None: logging.info("Assigned text values = %s" % str(assigner_text.values)) return assigner_visual, assigner_text @@ -286,8 +278,7 @@ def get_all_parameters(args, model): def print_optim(optimizer): for param_group in optimizer._param_groups: - print(param_group["group"], param_group["learning_rate"], - param_group["lr_scale"]) + print(param_group["group"], param_group["learning_rate"], param_group["lr_scale"]) def create_optimizer(args, model, lr_scheduler=None, return_params=False): diff --git a/paddlemix/processors/base_processing.py b/paddlemix/processors/base_processing.py index ae9bf62d18a8b..bdaa5d6f583b6 100644 --- a/paddlemix/processors/base_processing.py +++ b/paddlemix/processors/base_processing.py @@ -43,24 +43,22 @@ def __init__(self, *args, **kwargs): raise TypeError(f"Unexepcted keyword argument {key}.") for arg, attribute_name in zip(args, self.attributes): if attribute_name in kwargs: - raise TypeError( - f"Got multiple values for argument {attribute_name}.") + raise TypeError(f"Got multiple values for argument {attribute_name}.") else: kwargs[attribute_name] = arg if len(kwargs) != len(self.attributes): raise ValueError( f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got " - f"{len(args)} arguments instead.") + f"{len(args)} arguments instead." 
+ ) # Check each arg is of the proper class (this will also catch a user initializing in the wrong order) for attribute_name, arg in kwargs.items(): setattr(self, attribute_name, arg) def __repr__(self): - attributes_repr = [ - f"- {name}: {repr(getattr(self, name))}" for name in self.attributes - ] + attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] attributes_repr = "\n".join(attributes_repr) return f"{self.__class__.__name__}:\n{attributes_repr}" @@ -122,13 +120,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. """ - args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, - **kwargs) + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) return cls(*args) @classmethod - def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, - **kwargs): + def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): args = [] for attribute_name in cls.attributes: class_name = getattr(cls, f"{attribute_name}_class") @@ -136,9 +132,7 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, attribute_class = getattr(paddlemix.processors, class_name, None) if attribute_class is None: attribute_class = getattr(paddlenlp.transformers, class_name) - args.append( - attribute_class.from_pretrained(pretrained_model_name_or_path, - **kwargs)) + args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) return args @property diff --git a/paddlemix/processors/blip_processing.py b/paddlemix/processors/blip_processing.py index f74eaf37fc175..ff190c0b94be6 100644 --- a/paddlemix/processors/blip_processing.py +++ b/paddlemix/processors/blip_processing.py @@ -21,17 +21,33 @@ import numpy as np import PIL from paddlenlp.transformers.tokenizer_utils_base import ( - BatchEncoding, PreTokenizedInput, TensorType, TextInput) + BatchEncoding, + PreTokenizedInput, + TensorType, + TextInput, +) from .base_processing import ProcessorMixin from .image_transform_utils import ( - convert_to_rgb, normalize, random_horizontal_flip, random_resized_crop, - rescale, resize, to_channel_dimension_format) -from .image_utils import (IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, - ChannelDimension, ImageInput, PILImageResampling, - load_image, to_numpy_array, valid_images) -from .processing_utils import (BaseImageProcessor, BaseTextProcessor, - get_size_dict) + convert_to_rgb, + normalize, + random_horizontal_flip, + random_resized_crop, + rescale, + resize, + to_channel_dimension_format, +) +from .image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + load_image, + to_numpy_array, + valid_images, +) +from .processing_utils import BaseImageProcessor, BaseTextProcessor, get_size_dict __all__ = [ "Blip2Processor", @@ -60,14 +76,14 @@ def __init__(self, image_processor, text_processor, tokenizer): super().__init__(image_processor, text_processor, tokenizer) def __call__( - self, - images=None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[ - PreTokenizedInput]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - max_length=32, - mode="train", - **kwargs, ) -> BatchEncoding: + self, + images=None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + return_tensors: 
Optional[Union[str, TensorType]] = None, + max_length=32, + mode="train", + **kwargs, + ) -> BatchEncoding: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to Bert's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -116,12 +132,12 @@ def __call__( return_token_type_ids=False, max_length=32, padding=True, - **kwargs, ) + **kwargs, + ) return text_encoding # add pixel_values - encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, mode=mode) + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors, mode=mode) if text is not None: text_encoding = self.text_processor(text, mode=mode) @@ -131,7 +147,8 @@ def __call__( padding="longest", truncation=True, max_length=max_length, - return_attention_mask=True) + return_attention_mask=True, + ) else: text_encoding = None # eos_token_id = None @@ -159,8 +176,7 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class BlipTextProcessor(BaseTextProcessor): @@ -180,31 +196,31 @@ class BlipTextProcessor(BaseTextProcessor): """ def __init__( - self, - prompt: str="", - do_caption: bool=False, - do_question: bool=False, - max_words: int=50, - **kwargs, ): + self, + prompt: str = "", + do_caption: bool = False, + do_question: bool = False, + max_words: int = 50, + **kwargs, + ): super().__init__(**kwargs) if do_question and do_caption: - raise ValueError( - "do_caption and do_question cannot be set at the same time.") + raise ValueError("do_caption and do_question cannot be set at the same time.") if not do_caption and not do_question: - raise ValueError( - "Either do_caption or do_question must be set to True.") + raise ValueError("Either do_caption or do_question must be set to True.") self.prompt = prompt self.do_caption = do_caption self.do_question = do_question self.max_words = max_words def __call__( - self, - text, - do_caption: Optional[bool]=None, - do_question: Optional[bool]=None, - mode: str="train", - **kwargs, ): + self, + text, + do_caption: Optional[bool] = None, + do_question: Optional[bool] = None, + mode: str = "train", + **kwargs, + ): """ Preprocess the text before tokenization. 
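Note (editorial aside, not part of the diff): a rough usage sketch of the `Blip2Processor.__call__` path reformatted above; the checkpoint name is hypothetical and the dummy image only keeps the snippet self-contained:

```python
import PIL.Image
from paddlemix.processors.blip_processing import Blip2Processor

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")  # hypothetical checkpoint id
image = PIL.Image.new("RGB", (384, 384))  # stand-in for a real photo

encoding = processor(
    images=image,             # -> image_processor -> pixel_values
    text="a photo of a cat",  # -> text_processor (prompt/cleanup) -> tokenizer
    return_tensors="pd",
    max_length=32,
    mode="train",
)
# encoding carries pixel_values plus input_ids / attention_mask for the language model
```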
@@ -222,11 +238,9 @@ def __call__( do_caption = do_caption if do_caption is not None else self.do_caption do_question = do_question if do_question is not None else self.do_question if do_caption and do_question: - raise ValueError( - "do_caption and do_question cannot be set at the same time.") + raise ValueError("do_caption and do_question cannot be set at the same time.") if not do_caption and not do_question: - raise ValueError( - "Either do_caption or do_question must be set to True.") + raise ValueError("Either do_caption or do_question must be set to True.") if not isinstance(text, (list, tuple)): text = [text] @@ -246,18 +260,20 @@ def pre_caption(self, caption: str) -> str: caption = re.sub( r"([.!\"()*#:;~])", " ", - caption.lower(), ) + caption.lower(), + ) caption = re.sub( r"\s{2,}", " ", - caption, ) + caption, + ) caption = caption.rstrip("\n") caption = caption.strip(" ") # truncate caption caption_words = caption.split(" ") if len(caption_words) > self.max_words: - caption = " ".join(caption_words[:self.max_words]) + caption = " ".join(caption_words[: self.max_words]) return caption @@ -268,13 +284,14 @@ def pre_question(self, question: str) -> str: question = re.sub( r"([.!\"()*#:;~])", "", - question.lower(), ) + question.lower(), + ) question = question.rstrip(" ") # truncate question question_words = question.split(" ") if len(question_words) > self.max_words: - question = " ".join(question_words[:self.max_words]) + question = " ".join(question_words[: self.max_words]) return question @@ -325,23 +342,24 @@ class BlipImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - do_flip: bool=False, - flip_prob: float=0.5, - do_rand_resize_crop: bool=False, - scale: Optional[Union[List[float], Tuple[float]]]=(0.08, 1.0), - do_collate: bool=False, - mode: str="train", - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + do_flip: bool = False, + flip_prob: float = 0.5, + do_rand_resize_crop: bool = False, + scale: Optional[Union[List[float], Tuple[float]]] = (0.08, 1.0), + do_collate: bool = False, + mode: str = "train", + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 384, "width": 384} size = get_size_dict(size, default_to_square=True) @@ -352,8 +370,7 @@ def __init__( self.do_rescale = do_rescale self.rescale_factor = rescale_factor self.do_normalize = do_normalize - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_convert_rgb = do_convert_rgb self.do_flip = do_flip @@ -363,12 +380,13 @@ def __init__( self.do_collate = do_collate def resize( - self, - image: np.ndarray, - size: Dict[str, int], - 
resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -393,14 +411,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. @@ -415,12 +435,13 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -434,16 +455,16 @@ def normalize( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - return normalize( - image, mean=mean, std=std, data_format=data_format, **kwargs) + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) def random_resized_crop( - self, - image: np.ndarray, - size: Union[int, List, Tuple], - scale: float, - resample: PILImageResampling=PILImageResampling.BICUBIC, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> np.ndarray: """ Crop the input data to random size and aspect ratio. A crop of random size (default: of 0.08 to 1.0) of the original size and a random @@ -461,13 +482,9 @@ def random_resized_crop( Resampling filter to use when resiizing the image. """ size = list(size.values()) - return random_resized_crop( - image, size=size, scale=scale, resample=resample, **kwargs) + return random_resized_crop(image, size=size, scale=scale, resample=resample, **kwargs) - def random_horizontal_flip(self, - image: np.ndarray, - flip_prob: float, - **kwargs) -> np.ndarray: + def random_horizontal_flip(self, image: np.ndarray, flip_prob: float, **kwargs) -> np.ndarray: """ Horizontally flip the input data randomly with a given probability. 
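Note (editorial aside, not part of the diff): the `rescale` and `normalize` docstrings above state the arithmetic directly (`image = image * scale`, then `image = (image - mean) / std`). A tiny NumPy sketch of those two steps with the ImageNet constants this module imports:

```python
import numpy as np
from paddlemix.processors.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD

def rescale_then_normalize(image: np.ndarray) -> np.ndarray:
    # assumes an HWC uint8 array
    image = image.astype("float32") * (1 / 255)              # image = image * scale
    mean = np.asarray(IMAGENET_STANDARD_MEAN, dtype="float32")
    std = np.asarray(IMAGENET_STANDARD_STD, dtype="float32")
    return (image - mean) / std                              # image = (image - mean) / std
```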
@@ -480,25 +497,26 @@ def random_horizontal_flip(self, return random_horizontal_flip(image, flip_prob=flip_prob, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - do_flip: bool=None, - flip_prob: float=None, - do_rand_resize_crop: bool=None, - scale: Optional[Union[List[float], Tuple[float]]]=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - mode: str=None, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + do_flip: bool = None, + flip_prob: float = None, + do_rand_resize_crop: bool = None, + scale: Optional[Union[List[float], Tuple[float]]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + mode: str = None, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -546,19 +564,15 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb do_flip = do_flip if do_flip is not None else self.do_flip flip_prob = flip_prob if flip_prob is not None else self.flip_prob scale = scale if scale is not None else self.scale - do_rand_resize_crop = (do_rand_resize_crop - if do_rand_resize_crop is not None else - self.do_rand_resize_crop) + do_rand_resize_crop = do_rand_resize_crop if do_rand_resize_crop is not None else self.do_rand_resize_crop size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -570,30 +584,22 @@ def preprocess( images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") if do_flip and flip_prob is None: - raise ValueError( - "Flip probability must be specified if do_flip is True.") + raise ValueError("Flip probability must be specified if do_flip is True.") if do_rand_resize_crop and scale is None: - raise ValueError( - "Random resize crop probability must be specified if do_rand_resize_crop is True." - ) + raise ValueError("Random resize crop probability must be specified if do_rand_resize_crop is True.") # PIL RGBA images are converted to RGB if do_convert_rgb: @@ -603,39 +609,21 @@ def preprocess( images = [to_numpy_array(image) for image in images] if do_rand_resize_crop and mode == "train": images = [ - self.random_resized_crop( - image=image, size=size, scale=scale, resample=resample) - for image in images + self.random_resized_crop(image=image, size=size, scale=scale, resample=resample) for image in images ] elif do_resize and mode != "train": - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_flip and mode == "train": - images = [ - self.random_horizontal_flip( - image=image, flip_prob=flip_prob) for image in images - ] + images = [self.random_horizontal_flip(image=image, flip_prob=flip_prob) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: - images = [ - self.normalize( - image=image, mean=image_mean, std=image_std) - for image in images - ] + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] - images = [ - to_channel_dimension_format(image, data_format) for image in images - ] + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} - return BatchEncoding(data=data, tensor_type=return_tensors) \ No newline at end of file + return BatchEncoding(data=data, tensor_type=return_tensors) diff --git a/paddlemix/processors/clip_processing.py b/paddlemix/processors/clip_processing.py index 358719d9ff002..3f095f6bd85e7 100644 --- a/paddlemix/processors/clip_processing.py +++ b/paddlemix/processors/clip_processing.py @@ -22,17 +22,29 @@ import PIL from paddle.vision.transforms import functional as F from paddlenlp.transformers.tokenizer_utils_base import ( - BatchEncoding, PreTokenizedInput, TensorType, TextInput) + BatchEncoding, + PreTokenizedInput, + TensorType, + TextInput, +) from .base_processing import ProcessorMixin from .image_transform_utils import ( - convert_to_rgb, normalize, random_horizontal_flip, random_resized_crop, - rescale, resize, to_channel_dimension_format) -from .image_utils import (IMAGENET_STANDARD_MEAN, 
IMAGENET_STANDARD_STD, - ChannelDimension, ImageInput, PILImageResampling, - load_image, to_numpy_array, valid_images) -from .processing_utils import (BaseImageProcessor, BaseTextProcessor, - get_size_dict) + convert_to_rgb, + random_horizontal_flip, + random_resized_crop, + rescale, +) +from .image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + load_image, + valid_images, +) +from .processing_utils import BaseImageProcessor, BaseTextProcessor, get_size_dict __all__ = [ "CLIPProcessor", @@ -61,14 +73,14 @@ def __init__(self, image_processor, text_processor, tokenizer): super().__init__(image_processor, text_processor, tokenizer) def __call__( - self, - images=None, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[ - PreTokenizedInput]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - max_length=77, - mode="train", - **kwargs, ) -> BatchEncoding: + self, + images=None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + max_length=77, + mode="train", + **kwargs, + ) -> BatchEncoding: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to Bert's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode @@ -109,18 +121,17 @@ def __call__( raise ValueError("You have to specify either images or text.") # images PIL list - encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, mode=mode) + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors, mode=mode) - text_encoding = self.text_processor( - text, mode=mode) # text preprocessor before tokenizer + text_encoding = self.text_processor(text, mode=mode) # text preprocessor before tokenizer text_encoding = self.tokenizer( text=text_encoding, return_tensors=return_tensors, return_token_type_ids=False, max_length=max_length, padding=True, - **kwargs, ) + **kwargs, + ) for key, value in text_encoding.items(): shape = value.shape @@ -133,8 +144,7 @@ def __call__( fill_value = 0 newshape = shape newshape[-1] = max_length - shape[-1] - padtensor = paddle.full( - shape=newshape, fill_value=fill_value, dtype=value.dtype) + padtensor = paddle.full(shape=newshape, fill_value=fill_value, dtype=value.dtype) newvalue = paddle.concat([value, padtensor], axis=-1) text_encoding[key] = newvalue @@ -163,8 +173,7 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class CLIPTextProcessor(BaseTextProcessor): @@ -181,19 +190,21 @@ class CLIPTextProcessor(BaseTextProcessor): """ def __init__( - self, - prompt: str="", - max_words: int=77, - **kwargs, ): + self, + prompt: str = "", + max_words: int = 77, + **kwargs, + ): super().__init__(**kwargs) self.prompt = prompt self.max_words = max_words def __call__( - self, - text, - mode: str="train", - **kwargs, ): + self, + text, + mode: str = "train", + **kwargs, + ): """ Preprocess the text before tokenization. 
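Note (editorial aside, not part of the diff): `CLIPProcessor.__call__` pads every field of the tokenizer output up to the fixed `max_length` (77) by concatenating a constant tensor, as shown in the hunk above. A condensed sketch of that step; the real code may pick a different fill value for `input_ids`:

```python
import paddle

def pad_to_max_length(text_encoding: dict, max_length: int = 77) -> dict:
    for key, value in text_encoding.items():
        pad_len = max_length - value.shape[-1]
        if pad_len > 0:
            newshape = list(value.shape)
            newshape[-1] = pad_len
            padtensor = paddle.full(shape=newshape, fill_value=0, dtype=value.dtype)
            text_encoding[key] = paddle.concat([value, padtensor], axis=-1)
    return text_encoding
```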
@@ -218,18 +229,20 @@ def pre_caption(self, caption: str) -> str: caption = re.sub( r"([.!\"()*#:;~])", " ", - caption.lower(), ) + caption.lower(), + ) caption = re.sub( r"\s{2,}", " ", - caption, ) + caption, + ) caption = caption.rstrip("\n") caption = caption.strip(" ") # truncate caption caption_words = caption.split(" ") if len(caption_words) > self.max_words: - caption = " ".join(caption_words[:self.max_words]) + caption = " ".join(caption_words[: self.max_words]) return caption @@ -280,23 +293,24 @@ class CLIPImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - do_flip: bool=False, - flip_prob: float=0.5, - do_rand_resize_crop: bool=False, - scale: Optional[Union[List[float], Tuple[float]]]=(0.9, 1.0), - do_collate: bool=False, - mode: str="train", - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + do_flip: bool = False, + flip_prob: float = 0.5, + do_rand_resize_crop: bool = False, + scale: Optional[Union[List[float], Tuple[float]]] = (0.9, 1.0), + do_collate: bool = False, + mode: str = "train", + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 384, "width": 384} size = get_size_dict(size, default_to_square=True) @@ -307,8 +321,7 @@ def __init__( self.do_rescale = do_rescale self.rescale_factor = rescale_factor self.do_normalize = do_normalize - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_convert_rgb = do_convert_rgb self.do_flip = do_flip @@ -318,12 +331,13 @@ def __init__( self.do_collate = do_collate def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -348,14 +362,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. 
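Note (editorial aside, not part of the diff): the caption cleanup in `pre_caption` boils down to lowercasing, mapping a punctuation class to spaces, collapsing repeated whitespace, and truncating to `max_words`. A condensed standalone version of the hunk above:

```python
import re

def pre_caption(caption: str, max_words: int = 77) -> str:
    caption = re.sub(r"([.!\"()*#:;~])", " ", caption.lower())  # punctuation -> space
    caption = re.sub(r"\s{2,}", " ", caption)                   # collapse repeated whitespace
    caption = caption.rstrip("\n").strip(" ")
    words = caption.split(" ")
    if len(words) > max_words:                                  # truncate to max_words
        caption = " ".join(words[:max_words])
    return caption
```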
@@ -370,23 +386,24 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: paddle.Tensor, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: - tensor_normalize = paddle.vision.transforms.Normalize( - mean=mean, std=std, data_format=data_format, **kwargs) + self, + image: paddle.Tensor, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + tensor_normalize = paddle.vision.transforms.Normalize(mean=mean, std=std, data_format=data_format, **kwargs) return tensor_normalize(image) def random_resized_crop( - self, - image: np.ndarray, - size: Union[int, List, Tuple], - scale: float, - resample: PILImageResampling=PILImageResampling.BICUBIC, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ) -> np.ndarray: """ Crop the input data to random size and aspect ratio. A crop of random size (default: of 0.08 to 1.0) of the original size and a random @@ -404,13 +421,9 @@ def random_resized_crop( Resampling filter to use when resiizing the image. """ size = list(size.values()) - return random_resized_crop( - image, size=size, scale=scale, resample=resample, **kwargs) + return random_resized_crop(image, size=size, scale=scale, resample=resample, **kwargs) - def random_horizontal_flip(self, - image: np.ndarray, - flip_prob: float, - **kwargs) -> np.ndarray: + def random_horizontal_flip(self, image: np.ndarray, flip_prob: float, **kwargs) -> np.ndarray: """ Horizontally flip the input data randomly with a given probability. @@ -423,25 +436,26 @@ def random_horizontal_flip(self, return random_horizontal_flip(image, flip_prob=flip_prob, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - do_flip: bool=None, - flip_prob: float=None, - do_rand_resize_crop: bool=None, - scale: Optional[Union[List[float], Tuple[float]]]=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - mode: str=None, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + do_flip: bool = None, + flip_prob: float = None, + do_rand_resize_crop: bool = None, + scale: Optional[Union[List[float], Tuple[float]]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + mode: str = None, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
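Note (editorial aside, not part of the diff): in train mode `preprocess` chains random resized crop, random horizontal flip, rescale, and normalize per image before converting to the channel-first layout (the CLIP variant further down normalizes on the stacked batch instead). A compressed sketch of that loop, following the BLIP hunk earlier; `proc` stands for an image-processor instance and the argument plumbing is simplified:

```python
def preprocess_train(proc, images, size, scale, resample, flip_prob, rescale_factor, mean, std):
    out = []
    for image in images:
        image = proc.random_resized_crop(image=image, size=size, scale=scale, resample=resample)
        image = proc.random_horizontal_flip(image=image, flip_prob=flip_prob)
        image = proc.rescale(image=image, scale=rescale_factor)
        image = proc.normalize(image=image, mean=mean, std=std)
        out.append(image)
    return out
```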
@@ -489,19 +503,15 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb do_flip = do_flip if do_flip is not None else self.do_flip flip_prob = flip_prob if flip_prob is not None else self.flip_prob scale = scale if scale is not None else self.scale - do_rand_resize_crop = (do_rand_resize_crop - if do_rand_resize_crop is not None else - self.do_rand_resize_crop) + do_rand_resize_crop = do_rand_resize_crop if do_rand_resize_crop is not None else self.do_rand_resize_crop size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -513,54 +523,34 @@ def preprocess( images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") if do_flip and flip_prob is None: - raise ValueError( - "Flip probability must be specified if do_flip is True.") + raise ValueError("Flip probability must be specified if do_flip is True.") if do_rand_resize_crop and scale is None: - raise ValueError( - "Random resize crop probability must be specified if do_rand_resize_crop is True." 
- ) + raise ValueError("Random resize crop probability must be specified if do_rand_resize_crop is True.") if do_rand_resize_crop and mode == "train": images = [ - self.random_resized_crop( - image=image, size=size, scale=scale, resample=resample) - for image in images + self.random_resized_crop(image=image, size=size, scale=scale, resample=resample) for image in images ] elif do_resize and mode != "train": - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_flip and mode == "train": - images = [ - self.random_horizontal_flip( - image=image, flip_prob=flip_prob) for image in images - ] + images = [self.random_horizontal_flip(image=image, flip_prob=flip_prob) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: images = [convert_to_rgb(image) for image in images] images = [np.array(image, "float32") for image in images] @@ -571,24 +561,25 @@ def preprocess( batch_images["image"] / 255.0, mean=image_mean, std=image_std, - data_format="CHW", ) + data_format="CHW", + ) return {"image": image} - def preprocess_fixed( - self, images: ImageInput, - size: Optional[Dict[str, int]]=None) -> PIL.Image.Image: + def preprocess_fixed(self, images: ImageInput, size: Optional[Dict[str, int]] = None) -> PIL.Image.Image: size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) - processor = paddle.vision.transforms.Compose([ - paddle.vision.transforms.RandomResizedCrop( - [224, 224], scale=(1.0, 1.0), interpolation="bicubic"), - _convert_to_rgb, - paddle.vision.transforms.ToTensor(), - paddle.vision.transforms.Normalize( - mean=[0.48145466, 0.4578275, 0.40821073], - std=[0.26862954, 0.26130258, 0.27577711], ), - ]) + processor = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.RandomResizedCrop([224, 224], scale=(1.0, 1.0), interpolation="bicubic"), + _convert_to_rgb, + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize( + mean=[0.48145466, 0.4578275, 0.40821073], + std=[0.26862954, 0.26130258, 0.27577711], + ), + ] + ) inputs = [] for inp in images: inputs.append(processor(inp).unsqueeze(0)) @@ -618,8 +609,7 @@ def forward(self, img): scale = self.max_size / float(max(height, width)) if scale != 1.0: new_size = tuple(round(dim * scale) for dim in (height, width)) - img = paddle.vision.transforms.resize(img, new_size, - self.interpolation) + img = paddle.vision.transforms.resize(img, new_size, self.interpolation) pad_h = self.max_size - new_size[0] pad_w = self.max_size - new_size[1] img = paddle.vision.transforms.pad( @@ -630,41 +620,42 @@ def forward(self, img): pad_w - pad_w // 2, pad_h - pad_h // 2, ], - fill=self.fill, ) + fill=self.fill, + ) return img def image_transform( - image_size: int, - is_train: bool, - mean: Optional[Tuple[float, ...]]=(0.48145466, 0.4578275, 0.40821073), - std: Optional[Tuple[float, ...]]=(0.26862954, 0.26130258, 0.27577711), - resize_longest_max: bool=False, - fill_color: int=0, ): + image_size: int, + is_train: bool, + mean: Optional[Tuple[float, ...]] = (0.48145466, 0.4578275, 0.40821073), + std: Optional[Tuple[float, ...]] = (0.26862954, 0.26130258, 0.27577711), + resize_longest_max: bool = False, + fill_color: int = 0, +): if not isinstance(mean, (list, tuple)): - mean = (mean, ) * 3 
+ mean = (mean,) * 3 if not isinstance(std, (list, tuple)): - std = (std, ) * 3 + std = (std,) * 3 if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: image_size = image_size[0] normalize = paddle.vision.transforms.Normalize(mean=mean, std=std) if is_train: - return paddle.vision.transforms.Compose([ - paddle.vision.transforms.RandomResizedCrop( - image_size, scale=(1.0, 1.0), interpolation="bicubic"), - _convert_to_rgb, - paddle.vision.transforms.ToTensor(), - normalize, - ]) + return paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.RandomResizedCrop(image_size, scale=(1.0, 1.0), interpolation="bicubic"), + _convert_to_rgb, + paddle.vision.transforms.ToTensor(), + normalize, + ] + ) else: if resize_longest_max: transforms = [ResizeMaxSize(image_size, fill=fill_color)] else: transforms = [ - paddle.vision.transforms.Resize( - image_size, interpolation="bicubic"), + paddle.vision.transforms.Resize(image_size, interpolation="bicubic"), paddle.vision.transforms.CenterCrop(image_size), ] - transforms.extend( - [_convert_to_rgb, paddle.vision.transforms.ToTensor(), normalize]) + transforms.extend([_convert_to_rgb, paddle.vision.transforms.ToTensor(), normalize]) return paddle.vision.transforms.Compose(transforms) diff --git a/paddlemix/processors/groundingdino_processing.py b/paddlemix/processors/groundingdino_processing.py index 6e15c880bd9dc..7d1d4b48fa674 100644 --- a/paddlemix/processors/groundingdino_processing.py +++ b/paddlemix/processors/groundingdino_processing.py @@ -15,20 +15,14 @@ Processor class for GroundingDino. """ -import re -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Union -import numpy as np import paddle import paddle.vision.transforms as T -import PIL from paddlenlp.taskflow.utils import pad_batch_data -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) from .base_processing import ProcessorMixin -from .image_utils import (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, - valid_images) +from .image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, valid_images from .processing_utils import BaseImageProcessor, BaseTextProcessor from .utils import _max_by_axis @@ -50,18 +44,18 @@ def __init__(self, image_processor, text_processor, tokenizer): super().__init__(image_processor, text_processor, tokenizer) def __call__( - self, - images=None, - text: str=None, - **kwargs, ): + self, + images=None, + text: str = None, + **kwargs, + ): if images is None or text is None: raise ValueError("You have to specify either images and text.") self.prompt = self.text_processor.pre_caption(text) input_ids = self.tokenizer([self.prompt]).input_ids - specical_tokens = self.tokenizer.convert_tokens_to_ids( - ["[CLS]", "[SEP]", ".", "?"]) + specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) tokenized_out = self.text_processor(input_ids, specical_tokens) image_tensor, mask = self.image_processor(images) @@ -86,8 +80,7 @@ def decode(self, posmap): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class GroudingDinoTextProcessor(BaseTextProcessor): @@ -96,19 +89,21 @@ class GroudingDinoTextProcessor(BaseTextProcessor): """ def __init__( - self, - 
max_words: int=256, - **kwargs, ): + self, + max_words: int = 256, + **kwargs, + ): super().__init__(**kwargs) self.max_words = max_words self.caption = None def __call__( - self, - input_ids, - special_tokens_list, - **kwargs, ): + self, + input_ids, + special_tokens_list, + **kwargs, + ): """ Preprocess the text with tokenization. """ @@ -116,25 +111,19 @@ def __call__( input_ids = pad_batch_data(input_ids) input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64).squeeze(-1) tokenized_out["input_ids"] = input_ids - tokenized_out["attention_mask"] = paddle.cast(input_ids != 0, - paddle.int64) + tokenized_out["attention_mask"] = paddle.cast(input_ids != 0, paddle.int64) ( text_self_attention_masks, position_ids, cate_to_token_mask_list, - ) = self.generate_masks_with_special_tokens_and_transfer_map( - tokenized_out, special_tokens_list) + ) = self.generate_masks_with_special_tokens_and_transfer_map(tokenized_out, special_tokens_list) if text_self_attention_masks.shape[1] > self.max_words: - text_self_attention_masks = text_self_attention_masks[:, :self. - max_words, : - self.max_words] - position_ids = position_ids[:, :self.max_words] - tokenized_out["input_ids"] = tokenized_out[ - "input_ids"][:, :self.max_words] - tokenized_out["attention_mask"] = tokenized_out[ - "attention_mask"][:, :self.max_words] + text_self_attention_masks = text_self_attention_masks[:, : self.max_words, : self.max_words] + position_ids = position_ids[:, : self.max_words] + tokenized_out["input_ids"] = tokenized_out["input_ids"][:, : self.max_words] + tokenized_out["attention_mask"] = tokenized_out["attention_mask"][:, : self.max_words] tokenized_out["position_ids"] = position_ids tokenized_out["text_self_attention_masks"] = text_self_attention_masks @@ -150,8 +139,7 @@ def pre_caption(self, caption: str) -> str: self.caption = caption return caption - def generate_masks_with_special_tokens_and_transfer_map( - self, tokenized, special_tokens_list): + def generate_masks_with_special_tokens_and_transfer_map(self, tokenized, special_tokens_list): """Generate attention mask between each pair of special tokens Args: input_ids (torch.Tensor): input ids. 
Shape: [bs, num_token] @@ -170,8 +158,7 @@ def generate_masks_with_special_tokens_and_transfer_map( idxs = paddle.nonzero(special_tokens_mask) # generate attention mask and positional ids - attention_mask = (paddle.eye(num_token, dtype=paddle.int32) - .cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1])) + attention_mask = paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1]) position_ids = paddle.zeros((bs, num_token), dtype=paddle.int64) cate_to_token_mask_list = [[] for _ in range(bs)] previous_col = 0 @@ -182,17 +169,18 @@ def generate_masks_with_special_tokens_and_transfer_map( attention_mask[row, col, col] = True position_ids[row, col] = 0 else: - attention_mask[row, previous_col + 1:col + 1, previous_col + 1: - col + 1] = True - position_ids[row, previous_col + 1:col + 1] = paddle.arange( - 0, col - previous_col) - c2t_maski = paddle.zeros([num_token, ]).cast(paddle.bool) - c2t_maski[previous_col + 1:col] = True + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col) + c2t_maski = paddle.zeros( + [ + num_token, + ] + ).cast(paddle.bool) + c2t_maski[previous_col + 1 : col] = True cate_to_token_mask_list[row].append(c2t_maski) previous_col = col - return attention_mask, position_ids.cast( - paddle.int64), cate_to_token_mask_list + return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list class GroudingDinoImageProcessor(BaseImageProcessor): @@ -203,22 +191,22 @@ class GroudingDinoImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: List[int]=None, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_nested: bool=True, - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: List[int] = None, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = True, + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else 800 self.do_resize = do_resize self.size = size self.do_normalize = do_normalize - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_nested = do_nested @@ -229,8 +217,7 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None): min_original_size = float(min((w, h))) max_original_size = float(max((w, h))) if max_original_size / min_original_size * size > max_size: - size = int( - round(max_size * min_original_size / max_original_size)) + size = int(round(max_size * min_original_size / max_original_size)) if (w <= h and w == size) or (h <= w and h == size): return (h, w) @@ -256,16 +243,13 @@ def get_size(image_size, size, max_size=None): if target is None: return rescaled_image - ratios = tuple( - float(s) / float(s_orig) - for s, s_orig in zip(rescaled_image.size, image.size)) + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) ratio_width, ratio_height = ratios target = target.copy() if "boxes" in target: boxes = target["boxes"] - scaled_boxes = boxes * paddle.to_tensor( - [ratio_width, ratio_height, ratio_width, ratio_height]) + scaled_boxes = 
boxes * paddle.to_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) target["boxes"] = scaled_boxes if "area" in target: @@ -276,11 +260,10 @@ def get_size(image_size, size, max_size=None): h, w = size target["size"] = paddle.to_tensor([h, w]) - if "masks" in target: - target["masks"] = (interpolate( - target["masks"][:, None].cast(paddle.float32), - size, - mode="nearest")[:, 0] > 0.5) + # if "masks" in target: + # target["masks"] = ( + # interpolate(target["masks"][:, None].cast(paddle.float32), size, mode="nearest")[:, 0] > 0.5 + # ) return rescaled_image, target @@ -298,22 +281,23 @@ def nested_tensor_from_tensor_list(self, tensor_list: List[paddle.Tensor]): mask = paddle.ones((b, h, w), dtype=paddle.bool) for i in range(b): img = tensor_list[i] - tensor[i, :img.shape[0], :img.shape[1], :img.shape[2]] = img - mask[i, :img.shape[1], :img.shape[2]] = False + tensor[i, : img.shape[0], : img.shape[1], : img.shape[2]] = img + mask[i, : img.shape[1], : img.shape[2]] = False else: raise ValueError("not supported") return tensor, mask def preprocess( - self, - images, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_nested: bool=None, - **kwargs, ): + self, + images, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_nested: bool = None, + **kwargs, + ): """ Preprocess an image or batch of images. @@ -329,23 +313,17 @@ def preprocess( if not isinstance(images, (list, tuple)): images = [images] - if isinstance(images[0], str): - images = [load_image(image) for image in images] + # if isinstance(images[0], str): + # images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") if do_resize: - images = [ - T.to_tensor(self.resize( - image=image, size=size)) for image in images - ] + images = [T.to_tensor(self.resize(image=image, size=size)) for image in images] if do_normalize: images = T.normalize(images, mean=image_mean, std=image_std) diff --git a/paddlemix/processors/image_processing_utils.py b/paddlemix/processors/image_processing_utils.py index 4cb4fb5343c27..0c8a0913dd8cd 100644 --- a/paddlemix/processors/image_processing_utils.py +++ b/paddlemix/processors/image_processing_utils.py @@ -19,16 +19,25 @@ from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from huggingface_hub import (create_repo, get_hf_file_metadata, hf_hub_download, - hf_hub_url, repo_type_and_id_from_hf_id, - upload_folder) +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_download, + hf_hub_url, + repo_type_and_id_from_hf_id, + upload_folder, +) from huggingface_hub.utils import EntryNotFoundError from paddlenlp import __version__ -from paddlenlp.transformers.feature_extraction_utils import \ - BatchFeature as BaseBatchFeature +from paddlenlp.transformers.feature_extraction_utils import ( + BatchFeature as BaseBatchFeature, +) from paddlemix.utils.downloader import ( - COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock, resolve_cache_dir) + COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir, +) from paddlemix.utils.log import logger IMAGE_PROCESSOR_NAME = "image_preprocessor_config.json" @@ -75,9 +84,7 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" Instantiate a type of [`~processing_utils.ImageProcessingMixin`] from an image processor. @@ -155,13 +162,11 @@ def from_pretrained(cls, assert image_processor.do_normalize is False assert unused_kwargs == {"foo": False} ```""" - image_processor_dict, kwargs = cls.get_image_processor_dict( - pretrained_model_name_or_path, **kwargs) + image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(image_processor_dict, **kwargs) - def save_pretrained(self, save_directory: Union[str, os.PathLike], - **kwargs): + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): """ Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the [`~processing_utils.ImageProcessingMixin.from_pretrained`] class method. @@ -173,15 +178,12 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
""" if os.path.isfile(save_directory): - raise AssertionError( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` - output_image_processor_file = os.path.join(save_directory, - IMAGE_PROCESSOR_NAME) + output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME) self.to_json_file(output_image_processor_file) logger.info(f"Image processor saved in {output_image_processor_file}") @@ -189,13 +191,14 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], return [output_image_processor_file] def save_to_hf_hub( - self, - repo_id: str, - private: Optional[bool]=None, - subfolder: Optional[str]=None, - commit_message: Optional[str]=None, - revision: Optional[str]=None, - create_pr: bool=False, ): + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): """ Uploads all elements of this processor to a new HuggingFace Hub repository. Args: @@ -220,9 +223,7 @@ def save_to_hf_hub( # Check if README file already exist in repo try: - get_hf_file_metadata( - hf_hub_url( - repo_id=repo_id, filename="README.md", revision=revision)) + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) has_readme = True except EntryNotFoundError: has_readme = False @@ -248,12 +249,13 @@ def save_to_hf_hub( folder_path=root_dir, commit_message=commit_message, revision=revision, - create_pr=create_pr, ) + create_pr=create_pr, + ) @classmethod def get_image_processor_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`. 
@@ -271,14 +273,12 @@ def get_image_processor_dict( cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) subfolder = kwargs.pop("subfolder", None) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, - from_hf_hub, cache_dir) + cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): - resolved_image_processor_file = os.path.join( - pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME) + resolved_image_processor_file = os.path.join(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME) elif os.path.isfile(pretrained_model_name_or_path): resolved_image_processor_file = pretrained_model_name_or_path is_local = True @@ -290,18 +290,20 @@ def get_image_processor_dict( cache_dir=cache_dir, subfolder=subfolder, library_name="PaddleNLP", - library_version=__version__, ) + library_version=__version__, + ) else: # Assuming from community-contributed pretrained models - image_processor_file = "/".join([ - COMMUNITY_MODEL_PREFIX, - pretrained_model_name_or_path, - IMAGE_PROCESSOR_NAME, - ]) + image_processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + IMAGE_PROCESSOR_NAME, + ] + ) try: # Load from local folder or from cache or download from model Hub and cache - resolved_image_processor_file = get_path_from_url_with_filelock( - image_processor_file, cache_dir) + resolved_image_processor_file = get_path_from_url_with_filelock(image_processor_file, cache_dir) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to # the original exception. @@ -312,13 +314,12 @@ def get_image_processor_dict( f"Can't load image processor for '{pretrained_model_name_or_path}'. If you were trying to load" " it from 'BOS', make sure you don't have a local directory with the" f" same name. 
Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" - f" directory containing a {IMAGE_PROCESSOR_NAME} file") + f" directory containing a {IMAGE_PROCESSOR_NAME} file" + ) try: # Load image_processor dict - with open( - resolved_image_processor_file, "r", - encoding="utf-8") as reader: + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: text = reader.read() image_processor_dict = json.loads(text) @@ -328,8 +329,7 @@ def get_image_processor_dict( ) if is_local: - logger.info( - f"loading configuration file {resolved_image_processor_file}") + logger.info(f"loading configuration file {resolved_image_processor_file}") else: logger.info( f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}" @@ -449,14 +449,14 @@ def __call__(self, images, **kwargs) -> BatchFeature: return self.preprocess(images, **kwargs) def preprocess(self, images, **kwargs) -> BatchFeature: - raise NotImplementedError( - "Each image processor must implement its own preprocess method") + raise NotImplementedError("Each image processor must implement its own preprocess method") VALID_SIZE_DICT_KEYS = ( {"height", "width"}, {"shortest_edge"}, - {"shortest_edge", "longest_edge"}, ) + {"shortest_edge", "longest_edge"}, +) def is_valid_size_dict(size_dict): @@ -471,16 +471,15 @@ def is_valid_size_dict(size_dict): def convert_to_size_dict( - size, - max_size: Optional[int]=None, - default_to_square: bool=True, - height_width_order: bool=True, ): + size, + max_size: Optional[int] = None, + default_to_square: bool = True, + height_width_order: bool = True, +): # By default, if size is an int we assume it represents a tuple of (size, size). if isinstance(size, int) and default_to_square: if max_size is not None: - raise ValueError( - "Cannot specify both size as an int, with default_to_square=True and max_size" - ) + raise ValueError("Cannot specify both size as an int, with default_to_square=True and max_size") return {"height": size, "width": size} # In other configs, if size is an int and default_to_square is False, size represents the length of # the shortest edge after resizing. @@ -499,11 +498,12 @@ def convert_to_size_dict( def get_size_dict( - size: Union[int, Iterable[int], Dict[str, int]]=None, - max_size: Optional[int]=None, - height_width_order: bool=True, - default_to_square: bool=True, - param_name="size", ) -> dict: + size: Union[int, Iterable[int], Dict[str, int]] = None, + max_size: Optional[int] = None, + height_width_order: bool = True, + default_to_square: bool = True, + param_name="size", +) -> dict: """ Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height, @@ -526,11 +526,11 @@ def get_size_dict( If `size` is an int, whether to default to a square image or not. """ if not isinstance(size, dict): - size_dict = convert_to_size_dict(size, max_size, default_to_square, - height_width_order) + size_dict = convert_to_size_dict(size, max_size, default_to_square, height_width_order) logger.info( f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." 
- f" Converted to {size_dict}.", ) + f" Converted to {size_dict}.", + ) else: size_dict = size diff --git a/paddlemix/processors/image_transform_utils.py b/paddlemix/processors/image_transform_utils.py index b11ca848774a0..4d3e9918931b3 100644 --- a/paddlemix/processors/image_transform_utils.py +++ b/paddlemix/processors/image_transform_utils.py @@ -23,10 +23,16 @@ from paddle.vision.transforms import functional as F from PIL import Image -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - TensorType, get_channel_dimension_axis, - get_image_size, infer_channel_dimension_format, - to_numpy_array) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + TensorType, + get_channel_dimension_axis, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, +) from .utils import ExplicitEnum @@ -35,9 +41,9 @@ def is_paddle_tensor(tensor): def to_channel_dimension_format( - image: np.ndarray, - channel_dim: Union[ChannelDimension, str], - input_channel_dim: Optional[Union[ChannelDimension, str]]=None, + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, ) -> np.ndarray: """ Converts `image` to the channel dimension format specified by `channel_dim`. @@ -52,8 +58,7 @@ def to_channel_dimension_format( `np.ndarray`: The image with the channel dimension set to `channel_dim`. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if input_channel_dim is None: input_channel_dim = infer_channel_dimension_format(image) @@ -67,17 +72,17 @@ def to_channel_dimension_format( elif target_channel_dim == ChannelDimension.LAST: image = image.transpose((1, 2, 0)) else: - raise ValueError("Unsupported channel dimension format: {}".format( - channel_dim)) + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) return image def rescale( - image: np.ndarray, - scale: float, - data_format: Optional[ChannelDimension]=None, - dtype=np.float32, ) -> np.ndarray: + image: np.ndarray, + scale: float, + data_format: Optional[ChannelDimension] = None, + dtype=np.float32, +) -> np.ndarray: """ Rescales `image` by `scale`. @@ -96,20 +101,19 @@ def rescale( `np.ndarray`: The rescaled image. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") rescaled_image = image * scale if data_format is not None: - rescaled_image = to_channel_dimension_format(rescaled_image, - data_format) + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) rescaled_image = rescaled_image.astype(dtype) return rescaled_image def to_pil_image( - image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], - do_rescale: Optional[bool]=None, ) -> "PIL.Image.Image": + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. 
@@ -131,8 +135,7 @@ def to_pil_image( if is_paddle_tensor(image): image = image.numpy() elif not isinstance(image, np.ndarray): - raise ValueError("Input image type not supported: {}".format( - type(image))) + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. image = to_channel_dimension_format(image, ChannelDimension.LAST) @@ -141,8 +144,7 @@ def to_pil_image( image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. - do_rescale = (isinstance(image.flat[0], (float, np.float32, np.float64)) - if do_rescale is None else do_rescale) + do_rescale = isinstance(image.flat[0], (float, np.float32, np.float64)) if do_rescale is None else do_rescale if do_rescale: image = rescale(image, 255) image = image.astype(np.uint8) @@ -151,10 +153,11 @@ def to_pil_image( # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int], Tuple[int]], - default_to_square: bool=True, - max_size: Optional[int]=None, ) -> tuple: + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: """ Find the target (height, width) dimension of the output image after resizing given the input image and the desired size. @@ -190,8 +193,7 @@ def get_resize_output_image_size( # Perform same logic as if size was an int size = size[0] else: - raise ValueError( - "size must have 1 or 2 elements if it is a list or tuple") + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") if default_to_square: return (size, size) @@ -200,14 +202,14 @@ def get_resize_output_image_size( short, long = (width, height) if width <= height else (height, width) requested_new_short = size - new_short, new_long = requested_new_short, int(requested_new_short * long / - short) + new_short, new_long = requested_new_short, int(requested_new_short * long / short) if max_size is not None: if max_size <= requested_new_short: raise ValueError( f"max_size = {max_size} must be strictly greater than the requested " - f"size for the smaller edge size = {size}") + f"size for the smaller edge size = {size}" + ) if new_long > max_size: new_short, new_long = int(max_size * new_short / new_long), max_size @@ -215,12 +217,13 @@ def get_resize_output_image_size( def resize( - image, - size: Tuple[int, int], - resample: "PILImageResampling" =None, - reducing_gap: Optional[int]=None, - data_format: Optional[ChannelDimension]=None, - return_numpy: bool=True, ) -> np.ndarray: + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: """ Resizes `image` to `(height, width)` specified by `size` using the PIL library. @@ -250,8 +253,7 @@ def resize( # For all transformations, we want to keep the same data format as the input image unless otherwise specified. # The resized image from PIL will always have channels last, so find the input format first. 
- data_format = (infer_channel_dimension_format(image) - if data_format is None else data_format) + data_format = infer_channel_dimension_format(image) if data_format is None else data_format # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy @@ -259,27 +261,26 @@ def resize( image = to_pil_image(image) height, width = size # PIL images are in the format (width, height) - resized_image = image.resize( - (width, height), resample=resample, reducing_gap=reducing_gap) + resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) if return_numpy: resized_image = np.array(resized_image) # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image # so we need to add it back if necessary. - resized_image = (np.expand_dims( - resized_image, axis=-1) - if resized_image.ndim == 2 else resized_image) + resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image # The image is always in channels last format after converting from a PIL image resized_image = to_channel_dimension_format( - resized_image, data_format, input_channel_dim=ChannelDimension.LAST) + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) return resized_image def normalize( - image: np.ndarray, - mean: Union[float, Iterable[float]], - std: Union[float, Iterable[float]], - data_format: Optional[ChannelDimension]=None, ) -> np.ndarray: + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: """ Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. @@ -298,7 +299,8 @@ def normalize( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - # casting to numpy array and dividing by 255. 
image = to_numpy_array(image) @@ -313,18 +315,14 @@ def normalize( if isinstance(mean, Iterable): if len(mean) != num_channels: - raise ValueError( - f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}" - ) + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") else: mean = [mean] * num_channels mean = np.array(mean, dtype=image.dtype) if isinstance(std, Iterable): if len(std) != num_channels: - raise ValueError( - f"std must have {num_channels} elements if it is an iterable, got {len(std)}" - ) + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") else: std = [std] * num_channels std = np.array(std, dtype=image.dtype) @@ -334,16 +332,16 @@ def normalize( else: image = ((image.T - mean) / std).T - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image def center_crop( - image: np.ndarray, - size: Tuple[int, int], - data_format: Optional[Union[str, ChannelDimension]]=None, - return_numpy: Optional[bool]=None, ) -> np.ndarray: + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: """ Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to the size given, it will be padded (so the returned result will always be of size `size`). @@ -370,20 +368,18 @@ def center_crop( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) image = to_numpy_array(image) return_numpy = False if return_numpy is None else return_numpy else: return_numpy = True if return_numpy is None else return_numpy if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if not isinstance(size, Iterable) or len(size) != 2: - raise ValueError( - "size must have 2 elements representing the height and width of the output image" - ) + raise ValueError("size must have 2 elements representing the height and width of the output image") input_data_format = infer_channel_dimension_format(image) output_data_format = data_format if data_format is not None else input_data_format @@ -426,8 +422,7 @@ def center_crop( left += left_pad right += left_pad - new_image = new_image[..., max(0, top):min(new_height, bottom), max( - 0, left):min(new_width, right)] + new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)] new_image = to_channel_dimension_format(new_image, output_data_format) if not return_numpy: @@ -436,8 +431,7 @@ def center_crop( return new_image -def _center_to_corners_format_paddle( - bboxes_center: "paddle.Tensor") -> "paddle.Tensor": +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": center_x, center_y, width, height = bboxes_center.unbind(-1) bbox_corners = paddle.stack( # top left x, top left y, bottom right x, bottom right y @@ -447,7 +441,8 @@ def _center_to_corners_format_paddle( (center_x + 0.5 * width), (center_y + 0.5 * height), ], - axis=-1, ) + axis=-1, + ) return bbox_corners @@ -461,7 +456,8 @@ def 
_center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: center_x + 0.5 * width, center_y + 0.5 * height, ], - axis=-1, ) + axis=-1, + ) return bboxes_corners @@ -486,9 +482,9 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType: def _corners_to_center_format_paddle( - bboxes_corners: "paddle.Tensor", ) -> "paddle.Tensor": - top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind( - -1) + bboxes_corners: "paddle.Tensor", +) -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) b = [ (top_left_x + bottom_right_x) / 2, # center x (top_left_y + bottom_right_y) / 2, # center y @@ -507,7 +503,8 @@ def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: (bottom_right_x - top_left_x), # width (bottom_right_y - top_left_y), # height ], - axis=-1, ) + axis=-1, + ) return bboxes_center @@ -539,8 +536,7 @@ def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, - 2] + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) @@ -575,12 +571,12 @@ class PaddingMode(ExplicitEnum): def pad( - image: np.ndarray, - padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], - mode: PaddingMode=PaddingMode.CONSTANT, - constant_values: Union[float, Iterable[float]]=0.0, - data_format: Optional[Union[str, ChannelDimension]]=None, - input_data_format: Optional[Union[str, ChannelDimension]]=None, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Pads the `image` with the specified (height, width) `padding` and `mode`. 
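The `_center_to_corners_format_*` and `_corners_to_center_format_*` pairs above implement the usual (cx, cy, w, h) to (x0, y0, x1, y1) box conversions and back. A quick numpy check of the round trip, mirroring the stacked expressions in the hunks:

```python
import numpy as np

centers = np.array([[10.0, 20.0, 4.0, 6.0]])  # (cx, cy, w, h)

cx, cy, w, h = centers[..., 0], centers[..., 1], centers[..., 2], centers[..., 3]
corners = np.stack([cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h], axis=-1)

x0, y0, x1, y1 = corners[..., 0], corners[..., 1], corners[..., 2], corners[..., 3]
recovered = np.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], axis=-1)

assert np.allclose(recovered, centers)
print(corners)  # [[ 8. 17. 12. 23.]]
```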
@@ -628,19 +624,15 @@ def _expand_for_data_format(values): values = ((values, values), (values, values)) elif isinstance(values, tuple) and len(values) == 1: values = ((values[0], values[0]), (values[0], values[0])) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], int)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): values = (values, values) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], tuple)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): values = values else: raise ValueError(f"Unsupported format: {values}") # add 0 for channel dimension - values = (((0, 0), *values) - if input_data_format == ChannelDimension.FIRST else (*values, - (0, 0))) + values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) # Add additional padding if there's a batch dimension values = (0, *values) if image.ndim == 4 else values @@ -650,10 +642,7 @@ def _expand_for_data_format(values): if mode == PaddingMode.CONSTANT: constant_values = _expand_for_data_format(constant_values) - image = np.pad(image, - padding, - mode="constant", - constant_values=constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) elif mode == PaddingMode.REFLECT: image = np.pad(image, padding, mode="reflect") elif mode == PaddingMode.REPLICATE: @@ -663,8 +652,7 @@ def _expand_for_data_format(values): else: raise ValueError(f"Invalid padding mode: {mode}") - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image @@ -697,8 +685,9 @@ def decode_image(image_path: str) -> ImageInput: def random_horizontal_flip( - image: np.ndarray, - flip_prob: float, ) -> np.ndarray: + image: np.ndarray, + flip_prob: float, +) -> np.ndarray: """ Randomly flips the image horizontally. @@ -757,19 +746,18 @@ def _get_image_size(img): elif len(img.shape) == 4: return img.shape[2:][::-1] # nchw -> wh else: - raise ValueError( - "The dim for input Tensor should be 3-D or 4-D, but received {}". - format(len(img.shape))) + raise ValueError("The dim for input Tensor should be 3-D or 4-D, but received {}".format(len(img.shape))) else: raise TypeError(f"Unexpected type {type(img)}") def random_resized_crop( - image: np.ndarray, - size: Union[int, List, Tuple], - scale: float=(0.08, 1.0), - ratio: float=(3.0 / 4, 4.0 / 3), - resample: "PILImageResampling" =None, ) -> np.ndarray: + image: np.ndarray, + size: Union[int, List, Tuple], + scale: float = (0.08, 1.0), + ratio: float = (3.0 / 4, 4.0 / 3), + resample: "PILImageResampling" = None, +) -> np.ndarray: """ Crop the input data to random size and aspect ratio. 
A crop of random size (default: of 0.08 to 1.0) of the original size and a random diff --git a/paddlemix/processors/image_transforms.py b/paddlemix/processors/image_transforms.py index f8e07441533b5..135d42e0ac095 100644 --- a/paddlemix/processors/image_transforms.py +++ b/paddlemix/processors/image_transforms.py @@ -20,12 +20,17 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers.tokenizer_utils_base import (ExplicitEnum, - TensorType) +from paddlenlp.transformers.tokenizer_utils_base import ExplicitEnum, TensorType -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - get_channel_dimension_axis, get_image_size, - infer_channel_dimension_format, to_numpy_array) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_channel_dimension_axis, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, +) def is_paddle_tensor(tensor): @@ -33,9 +38,9 @@ def is_paddle_tensor(tensor): def to_channel_dimension_format( - image: np.ndarray, - channel_dim: Union[ChannelDimension, str], - input_channel_dim: Optional[Union[ChannelDimension, str]]=None, + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, ) -> np.ndarray: """ Converts `image` to the channel dimension format specified by `channel_dim`. @@ -50,8 +55,7 @@ def to_channel_dimension_format( `np.ndarray`: The image with the channel dimension set to `channel_dim`. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if input_channel_dim is None: input_channel_dim = infer_channel_dimension_format(image) @@ -65,17 +69,17 @@ def to_channel_dimension_format( elif target_channel_dim == ChannelDimension.LAST: image = image.transpose((1, 2, 0)) else: - raise ValueError("Unsupported channel dimension format: {}".format( - channel_dim)) + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) return image def rescale( - image: np.ndarray, - scale: float, - data_format: Optional[ChannelDimension]=None, - dtype=np.float32, ) -> np.ndarray: + image: np.ndarray, + scale: float, + data_format: Optional[ChannelDimension] = None, + dtype=np.float32, +) -> np.ndarray: """ Rescales `image` by `scale`. @@ -94,20 +98,19 @@ def rescale( `np.ndarray`: The rescaled image. """ if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") rescaled_image = image * scale if data_format is not None: - rescaled_image = to_channel_dimension_format(rescaled_image, - data_format) + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) rescaled_image = rescaled_image.astype(dtype) return rescaled_image def to_pil_image( - image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], - do_rescale: Optional[bool]=None, ) -> "PIL.Image.Image": + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": """ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. 
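For the shortest-edge branch of `get_resize_output_image_size` above (an int `size` with `default_to_square=False`), the short side is scaled to `size`, the long side keeps the aspect ratio, and `max_size` optionally caps the long side. A standalone sketch of that rule:

```python
def shortest_edge_resize(height, width, size, max_size=None):
    """Return (new_height, new_width); mirrors the shortest-edge logic above."""
    short, long = (width, height) if width <= height else (height, width)
    new_short, new_long = size, int(size * long / short)
    if max_size is not None and new_long > max_size:
        new_short, new_long = int(max_size * new_short / new_long), max_size
    return (new_long, new_short) if width <= height else (new_short, new_long)


print(shortest_edge_resize(480, 640, size=224))                # (224, 298)
print(shortest_edge_resize(480, 640, size=224, max_size=256))  # (192, 256)
```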
@@ -129,8 +132,7 @@ def to_pil_image( if is_paddle_tensor(image): image = image.numpy() elif not isinstance(image, np.ndarray): - raise ValueError("Input image type not supported: {}".format( - type(image))) + raise ValueError("Input image type not supported: {}".format(type(image))) # If the channel as been moved to first dim, we put it back at the end. image = to_channel_dimension_format(image, ChannelDimension.LAST) @@ -139,8 +141,7 @@ def to_pil_image( image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. - do_rescale = (isinstance(image.flat[0], (float, np.float32, np.float64)) - if do_rescale is None else do_rescale) + do_rescale = isinstance(image.flat[0], (float, np.float32, np.float64)) if do_rescale is None else do_rescale if do_rescale: image = rescale(image, 255) image = image.astype(np.uint8) @@ -149,10 +150,11 @@ def to_pil_image( # Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 def get_resize_output_image_size( - input_image: np.ndarray, - size: Union[int, Tuple[int, int], List[int], Tuple[int]], - default_to_square: bool=True, - max_size: Optional[int]=None, ) -> tuple: + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: """ Find the target (height, width) dimension of the output image after resizing given the input image and the desired size. @@ -188,8 +190,7 @@ def get_resize_output_image_size( # Perform same logic as if size was an int size = size[0] else: - raise ValueError( - "size must have 1 or 2 elements if it is a list or tuple") + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") if default_to_square: return (size, size) @@ -198,14 +199,14 @@ def get_resize_output_image_size( short, long = (width, height) if width <= height else (height, width) requested_new_short = size - new_short, new_long = requested_new_short, int(requested_new_short * long / - short) + new_short, new_long = requested_new_short, int(requested_new_short * long / short) if max_size is not None: if max_size <= requested_new_short: raise ValueError( f"max_size = {max_size} must be strictly greater than the requested " - f"size for the smaller edge size = {size}") + f"size for the smaller edge size = {size}" + ) if new_long > max_size: new_short, new_long = int(max_size * new_short / new_long), max_size @@ -213,12 +214,13 @@ def get_resize_output_image_size( def resize( - image, - size: Tuple[int, int], - resample: "PILImageResampling" =None, - reducing_gap: Optional[int]=None, - data_format: Optional[ChannelDimension]=None, - return_numpy: bool=True, ) -> np.ndarray: + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: """ Resizes `image` to `(height, width)` specified by `size` using the PIL library. @@ -248,8 +250,7 @@ def resize( # For all transformations, we want to keep the same data format as the input image unless otherwise specified. # The resized image from PIL will always have channels last, so find the input format first. 
- data_format = (infer_channel_dimension_format(image) - if data_format is None else data_format) + data_format = infer_channel_dimension_format(image) if data_format is None else data_format # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use # the pillow library to resize the image and then convert back to numpy @@ -257,27 +258,26 @@ def resize( image = to_pil_image(image) height, width = size # PIL images are in the format (width, height) - resized_image = image.resize( - (width, height), resample=resample, reducing_gap=reducing_gap) + resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) if return_numpy: resized_image = np.array(resized_image) # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image # so we need to add it back if necessary. - resized_image = (np.expand_dims( - resized_image, axis=-1) - if resized_image.ndim == 2 else resized_image) + resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image # The image is always in channels last format after converting from a PIL image resized_image = to_channel_dimension_format( - resized_image, data_format, input_channel_dim=ChannelDimension.LAST) + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) return resized_image def normalize( - image: np.ndarray, - mean: Union[float, Iterable[float]], - std: Union[float, Iterable[float]], - data_format: Optional[ChannelDimension]=None, ) -> np.ndarray: + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: """ Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. @@ -296,7 +296,8 @@ def normalize( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - # casting to numpy array and dividing by 255. 
image = to_numpy_array(image) @@ -311,18 +312,14 @@ def normalize( if isinstance(mean, Iterable): if len(mean) != num_channels: - raise ValueError( - f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}" - ) + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") else: mean = [mean] * num_channels mean = np.array(mean, dtype=image.dtype) if isinstance(std, Iterable): if len(std) != num_channels: - raise ValueError( - f"std must have {num_channels} elements if it is an iterable, got {len(std)}" - ) + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") else: std = [std] * num_channels std = np.array(std, dtype=image.dtype) @@ -332,16 +329,16 @@ def normalize( else: image = ((image.T - mean) / std).T - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image def center_crop( - image: np.ndarray, - size: Tuple[int, int], - data_format: Optional[Union[str, ChannelDimension]]=None, - return_numpy: Optional[bool]=None, ) -> np.ndarray: + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: """ Crops the `image` to the specified `size` using a center crop. Note that if the image is too small to be cropped to the size given, it will be padded (so the returned result will always be of size `size`). @@ -368,20 +365,18 @@ def center_crop( if isinstance(image, PIL.Image.Image): warnings.warn( "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", - FutureWarning, ) + FutureWarning, + ) image = to_numpy_array(image) return_numpy = False if return_numpy is None else return_numpy else: return_numpy = True if return_numpy is None else return_numpy if not isinstance(image, np.ndarray): - raise ValueError( - f"Input image must be of type np.ndarray, got {type(image)}") + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") if not isinstance(size, Iterable) or len(size) != 2: - raise ValueError( - "size must have 2 elements representing the height and width of the output image" - ) + raise ValueError("size must have 2 elements representing the height and width of the output image") input_data_format = infer_channel_dimension_format(image) output_data_format = data_format if data_format is not None else input_data_format @@ -424,8 +419,7 @@ def center_crop( left += left_pad right += left_pad - new_image = new_image[..., max(0, top):min(new_height, bottom), max( - 0, left):min(new_width, right)] + new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)] new_image = to_channel_dimension_format(new_image, output_data_format) if not return_numpy: @@ -434,8 +428,7 @@ def center_crop( return new_image -def _center_to_corners_format_paddle( - bboxes_center: "paddle.Tensor") -> "paddle.Tensor": +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": center_x, center_y, width, height = bboxes_center.unbind(-1) bbox_corners = paddle.stack( # top left x, top left y, bottom right x, bottom right y @@ -445,7 +438,8 @@ def _center_to_corners_format_paddle( (center_x + 0.5 * width), (center_y + 0.5 * height), ], - axis=-1, ) + axis=-1, + ) return bbox_corners @@ -459,7 +453,8 @@ def 
_center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: center_x + 0.5 * width, center_y + 0.5 * height, ], - axis=-1, ) + axis=-1, + ) return bboxes_corners @@ -484,9 +479,9 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType: def _corners_to_center_format_paddle( - bboxes_corners: "paddle.Tensor", ) -> "paddle.Tensor": - top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind( - -1) + bboxes_corners: "paddle.Tensor", +) -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) b = [ (top_left_x + bottom_right_x) / 2, # center x (top_left_y + bottom_right_y) / 2, # center y @@ -505,7 +500,8 @@ def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: (bottom_right_x - top_left_x), # width (bottom_right_y - top_left_y), # height ], - axis=-1, ) + axis=-1, + ) return bboxes_center @@ -537,8 +533,7 @@ def rgb_to_id(color): if isinstance(color, np.ndarray) and len(color.shape) == 3: if color.dtype == np.uint8: color = color.astype(np.int32) - return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, - 2] + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) @@ -573,12 +568,12 @@ class PaddingMode(ExplicitEnum): def pad( - image: np.ndarray, - padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], - mode: PaddingMode=PaddingMode.CONSTANT, - constant_values: Union[float, Iterable[float]]=0.0, - data_format: Optional[Union[str, ChannelDimension]]=None, - input_data_format: Optional[Union[str, ChannelDimension]]=None, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Pads the `image` with the specified (height, width) `padding` and `mode`. 
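The constant-mode branch of `pad` above expands the user-supplied padding into per-axis `(before, after)` pairs and prepends `(0, 0)` for the channel axis before delegating to `np.pad`. A minimal numpy equivalent for a channels-first image (sketch, with made-up padding values):

```python
import numpy as np

image = np.ones((3, 4, 4), dtype=np.float32)  # (C, H, W)

# (top, bottom) = (1, 1) rows and (left, right) = (2, 2) columns; channel axis untouched.
padding = ((0, 0), (1, 1), (2, 2))
padded = np.pad(image, padding, mode="constant", constant_values=0.0)

print(padded.shape)  # (3, 6, 8)
```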
@@ -626,19 +621,15 @@ def _expand_for_data_format(values): values = ((values, values), (values, values)) elif isinstance(values, tuple) and len(values) == 1: values = ((values[0], values[0]), (values[0], values[0])) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], int)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): values = (values, values) - elif (isinstance(values, tuple) and len(values) == 2 and - isinstance(values[0], tuple)): + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): values = values else: raise ValueError(f"Unsupported format: {values}") # add 0 for channel dimension - values = (((0, 0), *values) - if input_data_format == ChannelDimension.FIRST else (*values, - (0, 0))) + values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) # Add additional padding if there's a batch dimension values = (0, *values) if image.ndim == 4 else values @@ -648,10 +639,7 @@ def _expand_for_data_format(values): if mode == PaddingMode.CONSTANT: constant_values = _expand_for_data_format(constant_values) - image = np.pad(image, - padding, - mode="constant", - constant_values=constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) elif mode == PaddingMode.REFLECT: image = np.pad(image, padding, mode="reflect") elif mode == PaddingMode.REPLICATE: @@ -661,8 +649,7 @@ def _expand_for_data_format(values): else: raise ValueError(f"Invalid padding mode: {mode}") - image = (to_channel_dimension_format(image, data_format) - if data_format is not None else image) + image = to_channel_dimension_format(image, data_format) if data_format is not None else image return image diff --git a/paddlemix/processors/image_utils.py b/paddlemix/processors/image_utils.py index a9ac6fca3f62d..8f59dd153e395 100644 --- a/paddlemix/processors/image_utils.py +++ b/paddlemix/processors/image_utils.py @@ -49,15 +49,19 @@ def to_numpy(obj): return obj -if version.parse(version.parse(PIL.__version__).base_version) >= version.parse( - "9.1.0"): +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): PILImageResampling = PIL.Image.Resampling else: PILImageResampling = PIL.Image -ImageInput = Union["PIL.Image.Image", np.ndarray, "paddle.Tensor", - List["PIL.Image.Image"], List[np.ndarray], - List["paddle.Tensor"], ] # noqa +ImageInput = Union[ + "PIL.Image.Image", + np.ndarray, + "paddle.Tensor", + List["PIL.Image.Image"], + List[np.ndarray], + List["paddle.Tensor"], +] # noqa class TensorType(ExplicitEnum): @@ -76,8 +80,7 @@ class ChannelDimension(ExplicitEnum): def is_valid_image(img): - return (isinstance(img, PIL.Image.Image) or isinstance(img, np.ndarray) or - is_paddle_tensor(img)) + return isinstance(img, PIL.Image.Image) or isinstance(img, np.ndarray) or is_paddle_tensor(img) def valid_images(imgs): @@ -98,7 +101,7 @@ def is_batched(img): return False -def make_list_of_images(images, expected_ndims: int=3) -> List[ImageInput]: +def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: """ Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1. If the input is a batch of images, it is converted to a list of images. @@ -127,11 +130,12 @@ def make_list_of_images(images, expected_ndims: int=3) -> List[ImageInput]: else: raise ValueError( f"Invalid image shape. 
Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got" - f" {images.ndim} dimensions.") + f" {images.ndim} dimensions." + ) return images raise ValueError( - "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, paddle.Tensor " - f"but got {type(images)}.") + "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, paddle.Tensor " f"but got {type(images)}." + ) def to_numpy_array(img) -> np.ndarray: @@ -159,8 +163,7 @@ def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: elif image.ndim == 4: first_dim, last_dim = 1, 3 else: - raise ValueError( - f"Unsupported number of image dimensions: {image.ndim}") + raise ValueError(f"Unsupported number of image dimensions: {image.ndim}") if image.shape[first_dim] in (1, 3): return ChannelDimension.FIRST @@ -188,8 +191,7 @@ def get_channel_dimension_axis(image: np.ndarray) -> int: raise ValueError(f"Unsupported data format: {channel_dim}") -def get_image_size(image: np.ndarray, - channel_dim: ChannelDimension=None) -> Tuple[int, int]: +def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]: """ Returns the (height, width) dimensions of the image. @@ -213,37 +215,44 @@ def get_image_size(image: np.ndarray, raise ValueError(f"Unsupported data format: {channel_dim}") -def is_valid_annotation_coco_detection( - annotation: Dict[str, Union[List, Tuple]]) -> bool: - if (isinstance(annotation, dict) and "image_id" in annotation and - "annotations" in annotation and - isinstance(annotation["annotations"], (list, tuple)) and ( - # an image can have no annotations - len(annotation["annotations"]) == 0 or - isinstance(annotation["annotations"][0], dict))): +def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "annotations" in annotation + and isinstance(annotation["annotations"], (list, tuple)) + and ( + # an image can have no annotations + len(annotation["annotations"]) == 0 + or isinstance(annotation["annotations"][0], dict) + ) + ): return True return False -def is_valid_annotation_coco_panoptic( - annotation: Dict[str, Union[List, Tuple]]) -> bool: - if (isinstance(annotation, dict) and "image_id" in annotation and - "segments_info" in annotation and "file_name" in annotation and - isinstance(annotation["segments_info"], (list, tuple)) and ( - # an image can have no segments - len(annotation["segments_info"]) == 0 or - isinstance(annotation["segments_info"][0], dict))): +def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "segments_info" in annotation + and "file_name" in annotation + and isinstance(annotation["segments_info"], (list, tuple)) + and ( + # an image can have no segments + len(annotation["segments_info"]) == 0 + or isinstance(annotation["segments_info"][0], dict) + ) + ): return True return False -def valid_coco_detection_annotations( - annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: +def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: return all(is_valid_annotation_coco_detection(ann) for ann in annotations) -def valid_coco_panoptic_annotations( - annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: +def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: return 
all(is_valid_annotation_coco_panoptic(ann) for ann in annotations) @@ -280,8 +289,7 @@ def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": return image -def get_preprocess_shape(oldh: int, oldw: int, - long_side_length: int) -> Tuple[int, int]: +def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: """ Compute the output size given input size and target long side length. """ diff --git a/paddlemix/processors/imagebind_processing.py b/paddlemix/processors/imagebind_processing.py index 1b70eed72629f..ff69dca3e927d 100644 --- a/paddlemix/processors/imagebind_processing.py +++ b/paddlemix/processors/imagebind_processing.py @@ -17,19 +17,14 @@ """ import logging -from typing import Dict, List, Optional, Union +from typing import List, Optional, Union -import numpy as np import paddle from paddle.vision.transforms import transforms as T -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) +from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding from paddlevideo.data.clip_sampling import ConstantClipsPerVideoSampler -from PIL import Image from .base_processing import ProcessorMixin -from .image_processing_utils import BatchFeature -from .image_utils import ImageInput from .processing_utils import BaseAudioProcessor __all__ = ["ImageBindProcessor", "ImageBindAudioProcessor"] @@ -48,34 +43,23 @@ class ImageBindProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer, audio_processor): super().__init__(image_processor, tokenizer, audio_processor) - def __call__(self, - text=None, - images=None, - audios=None, - return_tensors=None, - **kwargs): + def __call__(self, text=None, images=None, audios=None, return_tensors=None, **kwargs): if text is None and images is None: - raise ValueError( - "You have to specify either text or images. Both cannot be none." - ) + raise ValueError("You have to specify either text or images. 
Both cannot be none.") if text is not None: - encoding = self.tokenizer( - text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) n, m = encoding["input_ids"].shape - zero_encoding = paddle.zeros( - shape=[n, self.tokenizer.max_len], dtype="int64") + zero_encoding = paddle.zeros(shape=[n, self.tokenizer.max_len], dtype="int64") zero_encoding[:, :m] = paddle.to_tensor(data=encoding["input_ids"]) encoding["input_ids"] = zero_encoding if images is not None: - image_features = self.image_processor( - images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) if audios is not None: - encoding["audio_values"] = self.audio_processor( - audios, return_tensors=return_tensors, **kwargs) + encoding["audio_values"] = self.audio_processor(audios, return_tensors=return_tensors, **kwargs) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values @@ -83,30 +67,29 @@ def __call__(self, elif text is not None: return encoding else: - return BatchEncoding( - data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) class ImageBindAudioProcessor(BaseAudioProcessor): model_input_names = ["audio_values"] def __init__( - self, - num_mel_bins: int=0, - target_length: int=0, - sample_rate: int=0, - clip_duration: int=0, - clips_per_video: int=0, - mean: Optional[Union[float, List[float]]]=None, - std: Optional[Union[float, List[float]]]=None, - **kwargs, ): + self, + num_mel_bins: int = 0, + target_length: int = 0, + sample_rate: int = 0, + clip_duration: int = 0, + clips_per_video: int = 0, + mean: Optional[Union[float, List[float]]] = None, + std: Optional[Union[float, List[float]]] = None, + **kwargs, + ): super().__init__(**kwargs) self.num_mel_bins = num_mel_bins self.target_length = target_length @@ -117,9 +100,10 @@ def __init__( self.std = std def preprocess( - self, - audio_path: Union[str, List[str]], - **kwargs, ): + self, + audio_path: Union[str, List[str]], + **kwargs, + ): """ Preprocess the text with tokenization. 
""" @@ -128,32 +112,38 @@ def preprocess( audio_outputs = [] # breakpoint() clip_sampler = ConstantClipsPerVideoSampler( - clip_duration=self.clip_duration, - clips_per_video=self.clips_per_video) + clip_duration=self.clip_duration, clips_per_video=self.clips_per_video + ) # for audio_path in audio_paths: waveform, sr = paddle.audio.load(audio_path) if self.sample_rate != sr: - waveform = paddle.audio.functional.resample( - waveform, orig_freq=sr, new_freq=self.sample_rate) + waveform = paddle.audio.functional.resample(waveform, orig_freq=sr, new_freq=self.sample_rate) - all_clips_timepoints = self.get_clip_timepoints( - clip_sampler, waveform.shape[1] / self.sample_rate) + all_clips_timepoints = self.get_clip_timepoints(clip_sampler, waveform.shape[1] / self.sample_rate) all_clips = [] for clip_timepoints in all_clips_timepoints: - waveform_clip = waveform[:, int(clip_timepoints[ - 0] * self.sample_rate):int(clip_timepoints[1] * - self.sample_rate), ] + waveform_clip = waveform[ + :, + int(clip_timepoints[0] * self.sample_rate) : int(clip_timepoints[1] * self.sample_rate), + ] waveform_melspec = self.waveform2melspec( - waveform_clip, self.sample_rate, self.num_mel_bins, - self.target_length) + waveform_clip, self.sample_rate, self.num_mel_bins, self.target_length + ) all_clips.append(waveform_melspec) normalize = T.Normalize( mean=self.mean - if not isinstance(self.mean, (float, int)) else [self.mean, ], + if not isinstance(self.mean, (float, int)) + else [ + self.mean, + ], std=self.std - if not isinstance(self.std, (float, int)) else [self.std, ], ) + if not isinstance(self.std, (float, int)) + else [ + self.std, + ], + ) all_clips = [normalize(ac) for ac in all_clips] all_clips = paddle.stack(x=all_clips, axis=0) @@ -165,13 +155,11 @@ def get_clip_timepoints(self, clip_sampler, duration): is_last_clip = False end = 0.0 while not is_last_clip: - start, end, _, _, is_last_clip = clip_sampler( - end, duration, annotation=None) + start, end, _, _, is_last_clip = clip_sampler(end, duration, annotation=None) all_clips_timepoints.append((start, end)) return all_clips_timepoints - def waveform2melspec(self, waveform, sample_rate, num_mel_bins, - target_length): + def waveform2melspec(self, waveform, sample_rate, num_mel_bins, target_length): waveform -= waveform.mean() fbank = paddle.audio.fbank( waveform, @@ -182,7 +170,8 @@ def waveform2melspec(self, waveform, sample_rate, num_mel_bins, num_mel_bins=num_mel_bins, dither=0.0, frame_length=25, - frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS, ) + frame_shift=DEFAULT_AUDIO_FRAME_SHIFT_MS, + ) x = fbank perm_0 = list(range(x.ndim)) perm_0[0] = 1 @@ -194,10 +183,10 @@ def waveform2melspec(self, waveform, sample_rate, num_mel_bins, logging.warning( "Large gap between audio n_frames(%d) and target_length (%d). 
Is the audio_target_length setting correct?", n_frames, - target_length, ) + target_length, + ) if p > 0: - fbank = paddle.pad_from_torch( - fbank, pad=(0, p), mode="constant", value=0) + fbank = paddle.pad_from_torch(fbank, pad=(0, p), mode="constant", value=0) elif p < 0: fbank = fbank[:, 0:target_length] diff --git a/paddlemix/processors/minigpt4_image_processing.py b/paddlemix/processors/minigpt4_image_processing.py index 0558e3124c4fb..1798f95574a4d 100644 --- a/paddlemix/processors/minigpt4_image_processing.py +++ b/paddlemix/processors/minigpt4_image_processing.py @@ -21,14 +21,26 @@ import PIL from paddlenlp.transformers.tokenizer_utils_base import TensorType -from .image_processing_utils import (BaseImageProcessor, BatchFeature, - get_size_dict) -from .image_transforms import (convert_to_rgb, normalize, rescale, resize, - to_channel_dimension_format) -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - is_batched, to_numpy_array, valid_images) - -__all__ = ["MiniGPT4ImageProcessor", ] +from .image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from .image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) + +__all__ = [ + "MiniGPT4ImageProcessor", +] class MiniGPT4ImageProcessor(BaseImageProcessor): @@ -69,17 +81,18 @@ class MiniGPT4ImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: super().__init__(**kwargs) default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] @@ -97,12 +110,13 @@ def __init__( self.do_convert_rgb = do_convert_rgb def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -127,14 +141,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. 
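The rescale step documented here is a plain multiplication, as the docstring says (image = image * scale). A small sketch using the processor's default 1/255 factor; the sample values are only for illustration:

import numpy as np

image_uint8 = np.array([[0, 128, 255]], dtype=np.uint8)
rescaled = image_uint8.astype(np.float32) * (1 / 255)   # image = image * scale
print(rescaled)                                          # [[0.        0.5019608 1.       ]]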
@@ -149,12 +165,13 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -168,24 +185,24 @@ def normalize( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. """ - return normalize( - image, mean=mean, std=std, data_format=data_format, **kwargs) + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -227,13 +244,11 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -242,21 +257,16 @@ def preprocess( images = [images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") # PIL RGBA images are converted to RGB if do_convert_rgb: @@ -266,28 +276,15 @@ def preprocess( images = [to_numpy_array(image) for image in images] if do_resize: - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: - images = [ - self.normalize( - image=image, mean=image_mean, std=image_std) - for image in images - ] - - images = [ - to_channel_dimension_format(image, data_format) for image in images - ] + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlemix/processors/minigpt4_processing.py b/paddlemix/processors/minigpt4_processing.py index 72d71e7af1d34..3a79ceb64575f 100644 --- a/paddlemix/processors/minigpt4_processing.py +++ b/paddlemix/processors/minigpt4_processing.py @@ -20,15 +20,20 @@ import numpy as np import paddle -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) +from paddlenlp.transformers.tokenizer_utils_base import ( + BatchEncoding, + TensorType, + TextInput, +) from PIL import Image from .base_processing import ProcessorMixin from .image_processing_utils import BatchFeature from .image_utils import ImageInput -__all__ = ["MiniGPT4Processor", ] +__all__ = [ + "MiniGPT4Processor", +] class MiniGPT4Processor(ProcessorMixin): @@ -74,16 +79,16 @@ def __init__(self, image_processor, tokenizer): tokenizer.pad_token = tokenizer.eos_token super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - self.default_prompt = ( - "###Human: ###Assistant: ") + self.default_prompt = "###Human: ###Assistant: " self.image_tag = "" self.text_tag = "" def process_images( - self, - images: ImageInput, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchFeature: + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: """ This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model. Please refer to the docstring of the method for more information. 
@@ -95,34 +100,31 @@ def process_images( images = [images] # processing with image processor - processed_images = self.image_processor( - images, return_tensors=return_tensors) + processed_images = self.image_processor(images, return_tensors=return_tensors) return processed_images def process_texts( - self, - texts: Union[TextInput, List[TextInput]], - prompts: Union[TextInput, List[TextInput]]=None, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ): + self, + texts: Union[TextInput, List[TextInput]], + prompts: Union[TextInput, List[TextInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ): prompts = prompts if prompts is not None else [self.default_prompt] if (not isinstance(texts, TextInput)) and (not isinstance(texts, list)): + raise TypeError("Unsupported type for texts: {}, only str and list type supported.".format(type(texts))) + if prompts is not None and (not isinstance(prompts, TextInput)) and (not isinstance(prompts, list)): raise TypeError( - "Unsupported type for texts: {}, only str and list type supported.". - format(type(texts))) - if (prompts is not None and (not isinstance(prompts, TextInput)) and - (not isinstance(prompts, list))): - raise TypeError( - "Unsupported type for prompts: {}, only str and list type supported.". - format(type(prompts))) + "Unsupported type for prompts: {}, only str and list type supported.".format(type(prompts)) + ) if isinstance(prompts, list): if isinstance(texts, list) and len(prompts) != len(texts): raise ValueError( - "The length of prompts not is equal to texts' length: {} != {}". - format(len(prompts), len(texts))) + "The length of prompts not is equal to texts' length: {} != {}".format(len(prompts), len(texts)) + ) elif isinstance(texts, TextInput): texts = [texts] * len(prompts) else: @@ -137,47 +139,51 @@ def process_texts( if self.image_tag not in text: if self.image_tag not in prompt: raise ValueError( - "A prompt should contain a image tag `{}` to insert image embeddings. if you don't want to use prompt function, you have to input a text with the image tag `{}`.". - format(self.image_tag, self.image_tag)) + "A prompt should contain a image tag `{}` to insert image embeddings. if you don't want to use prompt function, you have to input a text with the image tag `{}`.".format( + self.image_tag, self.image_tag + ) + ) if self.text_tag not in prompt: raise ValueError( - "A prompt should contain a text tag `{}` to insert text information.". 
- format(self.text_tag)) + "A prompt should contain a text tag `{}` to insert text information.".format(self.text_tag) + ) assemble_texts.append(prompt.replace(self.text_tag, text)) else: assemble_texts.append(text) # processing with text tokenizer - first_texts, second_texts = zip(* [ - assemble_text.split(self.image_tag) - for assemble_text in assemble_texts - ]) + first_texts, second_texts = zip(*[assemble_text.split(self.image_tag) for assemble_text in assemble_texts]) first_text_encoding = self.tokenizer( text=first_texts, return_tensors=return_tensors, add_special_tokens=True, - **kwargs, ) + **kwargs, + ) second_text_encoding = self.tokenizer( text=second_texts, return_tensors=return_tensors, add_special_tokens=False, - **kwargs, ) - - encoded_texts = BatchEncoding({ - "first_input_ids": first_text_encoding["input_ids"], - "first_attention_mask": first_text_encoding["attention_mask"], - "second_input_ids": second_text_encoding["input_ids"], - "second_attention_mask": second_text_encoding["attention_mask"], - }) + **kwargs, + ) + + encoded_texts = BatchEncoding( + { + "first_input_ids": first_text_encoding["input_ids"], + "first_attention_mask": first_text_encoding["attention_mask"], + "second_input_ids": second_text_encoding["input_ids"], + "second_attention_mask": second_text_encoding["attention_mask"], + } + ) return encoded_texts def __call__( - self, - images: ImageInput=None, - text: str=None, - prompt: str=None, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchFeature: + self, + images: ImageInput = None, + text: str = None, + prompt: str = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: """ This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model, and [`LlamaTokenizer.__call__`] to prepare text for the model. @@ -186,21 +192,17 @@ def __call__( prompt = prompt if prompt is not None else self.default_prompt if images is None and text is None: - raise ValueError( - "Images and text are None, you have to specify either images or texts." - ) - if images is not None and not isinstance( - images, (Image.Image, np.ndarray, paddle.Tensor, list)): + raise ValueError("Images and text are None, you have to specify either images or texts.") + if images is not None and not isinstance(images, (Image.Image, np.ndarray, paddle.Tensor, list)): raise TypeError( - "A type in [Image.Image, np.ndarray, paddle.Tensor, list] for images is expected, but received {}.". - format(type(images))) + "A type in [Image.Image, np.ndarray, paddle.Tensor, list] for images is expected, but received {}.".format( + type(images) + ) + ) if text is not None and not isinstance(text, str): - raise TypeError("A str type of text is expected, but received {}.". 
- format(type(text))) + raise TypeError("A str type of text is expected, but received {}.".format(type(text))) if prompt is not None and not isinstance(prompt, str): - raise TypeError( - "A str type of prompt is expected, but received {}.".format( - type(prompt))) + raise TypeError("A str type of prompt is expected, but received {}.".format(type(prompt))) if images is not None and not isinstance(images, list): images = [images] @@ -214,8 +216,7 @@ def __call__( # image-only mode if text is None: # processing with image processor - processed_features = self.process_images( - images, return_tensors=return_tensors, **kwargs) + processed_features = self.process_images(images, return_tensors=return_tensors, **kwargs) return processed_features # text-only mode @@ -225,8 +226,7 @@ def __call__( return encoded_texts # text-image mode - processed_features = self.image_processor( - images, return_tensors=return_tensors) + processed_features = self.image_processor(images, return_tensors=return_tensors) encoded_texts = self.process_texts(texts, prompts, **kwargs) processed_features.update(encoded_texts) @@ -251,5 +251,4 @@ def decode(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/paddlemix/processors/processing_utils.py b/paddlemix/processors/processing_utils.py index dd755ce4c9447..4bce239cd1133 100644 --- a/paddlemix/processors/processing_utils.py +++ b/paddlemix/processors/processing_utils.py @@ -19,15 +19,23 @@ from typing import Any, Dict, Iterable, Optional, Tuple, Union import numpy as np -from huggingface_hub import (create_repo, get_hf_file_metadata, hf_hub_download, - hf_hub_url, repo_type_and_id_from_hf_id, - upload_folder) +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_download, + hf_hub_url, + repo_type_and_id_from_hf_id, + upload_folder, +) from huggingface_hub.utils import EntryNotFoundError from paddlenlp import __version__ from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding from paddlemix.utils.downloader import ( - COMMUNITY_MODEL_PREFIX, get_path_from_url_with_filelock, resolve_cache_dir) + COMMUNITY_MODEL_PREFIX, + get_path_from_url_with_filelock, + resolve_cache_dir, +) from paddlemix.utils.log import logger PROCESSOR_CONFIG_MAPPING = { @@ -63,9 +71,7 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class @classmethod - def from_pretrained(cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): r""" Instantiate a type of [`~processing_utils.BaseProcessingMixin`] from an processor. @@ -121,13 +127,11 @@ def from_pretrained(cls, Returns: A processor of type [`~processing_utils.BaseProcessingMixin`]. 
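For context, the community-model branch above simply joins a URL prefix, the model name, and the processor config filename before downloading the file with `get_path_from_url_with_filelock`. A hedged sketch with placeholder values; the real prefix and filename come from `COMMUNITY_MODEL_PREFIX` and `PROCESSOR_CONFIG_MAPPING[cls.input_type]`:

COMMUNITY_MODEL_PREFIX = "https://example.org/models/community"   # placeholder, not the real prefix
pretrained_model_name_or_path = "some-org/some-processor"          # hypothetical model id
config_name = "image_preprocessor_config.json"                     # hypothetical config filename

processor_file = "/".join([COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path, config_name])
print(processor_file)
# https://example.org/models/community/some-org/some-processor/image_preprocessor_config.json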
```""" - processor_dict, kwargs = cls.get_processor_dict( - pretrained_model_name_or_path, **kwargs) + processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) return cls.from_dict(processor_dict, **kwargs) - def save_pretrained(self, save_directory: Union[str, os.PathLike], - **kwargs): + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): """ Save an processor object to the directory `save_directory`, so that it can be re-loaded using the [`~processing_utils.BaseProcessingMixin.from_pretrained`] class method. @@ -139,15 +143,12 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. """ if os.path.isfile(save_directory): - raise AssertionError( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` - output_processor_file = os.path.join( - save_directory, PROCESSOR_CONFIG_MAPPING[self.input_type]) + output_processor_file = os.path.join(save_directory, PROCESSOR_CONFIG_MAPPING[self.input_type]) self.to_json_file(output_processor_file) logger.info(f"processor saved in {output_processor_file}") @@ -155,13 +156,14 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike], return [output_processor_file] def save_to_hf_hub( - self, - repo_id: str, - private: Optional[bool]=None, - subfolder: Optional[str]=None, - commit_message: Optional[str]=None, - revision: Optional[str]=None, - create_pr: bool=False, ): + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): """ Uploads all elements of this processor to a new HuggingFace Hub repository. Args: @@ -186,9 +188,7 @@ def save_to_hf_hub( # Check if README file already exist in repo try: - get_hf_file_metadata( - hf_hub_url( - repo_id=repo_id, filename="README.md", revision=revision)) + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) has_readme = True except EntryNotFoundError: has_readme = False @@ -214,12 +214,13 @@ def save_to_hf_hub( folder_path=root_dir, commit_message=commit_message, revision=revision, - create_pr=create_pr, ) + create_pr=create_pr, + ) @classmethod def get_processor_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], - **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a processor of type [`~processor_utils.BaseProcessingMixin`] using `from_dict`. 
@@ -237,15 +238,14 @@ def get_processor_dict( cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) subfolder = kwargs.pop("subfolder", None) - cache_dir = resolve_cache_dir(pretrained_model_name_or_path, - from_hf_hub, cache_dir) + cache_dir = resolve_cache_dir(pretrained_model_name_or_path, from_hf_hub, cache_dir) pretrained_model_name_or_path = str(pretrained_model_name_or_path) is_local = os.path.isdir(pretrained_model_name_or_path) if os.path.isdir(pretrained_model_name_or_path): resolved_processor_file = os.path.join( - pretrained_model_name_or_path, - PROCESSOR_CONFIG_MAPPING[cls.input_type]) + pretrained_model_name_or_path, PROCESSOR_CONFIG_MAPPING[cls.input_type] + ) elif os.path.isfile(pretrained_model_name_or_path): resolved_processor_file = pretrained_model_name_or_path is_local = True @@ -257,18 +257,20 @@ def get_processor_dict( cache_dir=cache_dir, subfolder=subfolder, library_name="PaddleNLP", - library_version=__version__, ) + library_version=__version__, + ) else: # Assuming from community-contributed pretrained models - processor_file = "/".join([ - COMMUNITY_MODEL_PREFIX, - pretrained_model_name_or_path, - PROCESSOR_CONFIG_MAPPING[cls.input_type], - ]) + processor_file = "/".join( + [ + COMMUNITY_MODEL_PREFIX, + pretrained_model_name_or_path, + PROCESSOR_CONFIG_MAPPING[cls.input_type], + ] + ) try: # Load from local folder or from cache or download from model Hub and cache - resolved_processor_file = get_path_from_url_with_filelock( - processor_file, cache_dir) + resolved_processor_file = get_path_from_url_with_filelock(processor_file, cache_dir) except EnvironmentError: # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to # the original exception. 
@@ -296,9 +298,7 @@ def get_processor_dict( if is_local: logger.info(f"loading configuration file {resolved_processor_file}") else: - logger.info( - f"loading configuration file {processor_file} from cache at {resolved_processor_file}" - ) + logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}") return processor_dict, kwargs @@ -416,8 +416,7 @@ def __call__(self, images, **kwargs) -> BatchEncoding: return self.preprocess(images, **kwargs) def preprocess(self, images, **kwargs) -> BatchEncoding: - raise NotImplementedError( - "Each image processor must implement its own preprocess method") + raise NotImplementedError("Each image processor must implement its own preprocess method") class BaseTextProcessor(BaseProcessingMixin): @@ -431,8 +430,7 @@ def __call__(self, text, **kwargs) -> BatchEncoding: return self.preprocess(text, **kwargs) def preprocess(self, text, **kwargs) -> BatchEncoding: - raise NotImplementedError( - "Each image processor must implement its own preprocess method") + raise NotImplementedError("Each image processor must implement its own preprocess method") class BaseAudioProcessor(BaseProcessingMixin): @@ -446,14 +444,14 @@ def __call__(self, audios, **kwargs) -> BatchEncoding: return self.preprocess(audios, **kwargs) def preprocess(self, audios, **kwargs) -> BatchEncoding: - raise NotImplementedError( - "Each audios processor must implement its own preprocess method") + raise NotImplementedError("Each audios processor must implement its own preprocess method") VALID_SIZE_DICT_KEYS = ( {"height", "width"}, {"shortest_edge"}, - {"shortest_edge", "longest_edge"}, ) + {"shortest_edge", "longest_edge"}, +) def is_valid_size_dict(size_dict): @@ -468,16 +466,15 @@ def is_valid_size_dict(size_dict): def convert_to_size_dict( - size, - max_size: Optional[int]=None, - default_to_square: bool=True, - height_width_order: bool=True, ): + size, + max_size: Optional[int] = None, + default_to_square: bool = True, + height_width_order: bool = True, +): # By default, if size is an int we assume it represents a tuple of (size, size). if isinstance(size, int) and default_to_square: if max_size is not None: - raise ValueError( - "Cannot specify both size as an int, with default_to_square=True and max_size" - ) + raise ValueError("Cannot specify both size as an int, with default_to_square=True and max_size") return {"height": size, "width": size} # In other configs, if size is an int and default_to_square is False, size represents the length of # the shortest edge after resizing. @@ -496,11 +493,12 @@ def convert_to_size_dict( def get_size_dict( - size: Union[int, Iterable[int], Dict[str, int]]=None, - max_size: Optional[int]=None, - height_width_order: bool=True, - default_to_square: bool=True, - param_name="size", ) -> dict: + size: Union[int, Iterable[int], Dict[str, int]] = None, + max_size: Optional[int] = None, + height_width_order: bool = True, + default_to_square: bool = True, + param_name="size", +) -> dict: """ Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height, @@ -523,11 +521,11 @@ def get_size_dict( If `size` is an int, whether to default to a square image or not. 
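The SAM prompt and image processors above both rely on `get_preprocess_shape` to map the original resolution onto the model's input size before coordinates and boxes are rescaled. A minimal sketch of that rescaling, assuming the usual SAM rule of scaling the longer side to `size` (here 1024) and rounding to the nearest integer; names below are illustrative:

import numpy as np

def preprocess_shape(old_h, old_w, long_side=1024):
    # assumed resize rule: scale the longer side to long_side, round to nearest int
    scale = long_side / max(old_h, old_w)
    return int(old_h * scale + 0.5), int(old_w * scale + 0.5)

old_h, old_w = 480, 640
new_h, new_w = preprocess_shape(old_h, old_w)   # (768, 1024)

coords = np.array([[320.0, 240.0]])             # one (x, y) prompt point
coords[..., 0] *= new_w / old_w                 # per-axis scaling, as in apply_coords
coords[..., 1] *= new_h / old_h
print((new_h, new_w), coords)                   # (768, 1024) [[512. 384.]]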
""" if not isinstance(size, dict): - size_dict = convert_to_size_dict(size, max_size, default_to_square, - height_width_order) + size_dict = convert_to_size_dict(size, max_size, default_to_square, height_width_order) logger.info( f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." - f" Converted to {size_dict}.", ) + f" Converted to {size_dict}.", + ) else: size_dict = size diff --git a/paddlemix/processors/sam_processing.py b/paddlemix/processors/sam_processing.py index 50ace480b7cfd..349d6d9e944a6 100644 --- a/paddlemix/processors/sam_processing.py +++ b/paddlemix/processors/sam_processing.py @@ -15,20 +15,22 @@ Processor class for Sam. """ -import re from copy import deepcopy from typing import Dict, List, Optional, Tuple, Union import numpy as np import paddle -import PIL from paddle.nn import functional as F from paddle.vision.transforms.functional import resize from .base_processing import ProcessorMixin from .image_transform_utils import to_pil_image -from .image_utils import (IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, - get_preprocess_shape, valid_images) +from .image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + get_preprocess_shape, + valid_images, +) from .processing_utils import BaseImageProcessor, BaseTextProcessor __all__ = [ @@ -52,17 +54,17 @@ def __init__(self, image_processor, prompt_processor): self.encode_size = self.image_processor.size def __call__( - self, - images, - input_type, - point_coords=None, - point_labels=None, - box=None, - **kwargs, ): + self, + images, + input_type, + point_coords=None, + point_labels=None, + box=None, + **kwargs, + ): if images is None or input_type is None: - raise ValueError( - "You have to specify either images and input_type.") + raise ValueError("You have to specify either images and input_type.") if input_type == "boxs" and box is None: raise ValueError("You have to specify either box.") @@ -78,20 +80,21 @@ def __call__( self.original_size, point_coords=point_coords, point_labels=point_labels, - box=box, ) + box=box, + ) return image_seg, prompt - def postprocess_masks(self, low_res_masks, mask_threshold: float=0.0): + def postprocess_masks(self, low_res_masks, mask_threshold: float = 0.0): masks = F.interpolate( paddle.to_tensor(low_res_masks), (self.encode_size, self.encode_size), mode="bilinear", - align_corners=False, ) - masks = masks[..., :self.input_size[0], :self.input_size[1]] - masks = F.interpolate( - masks, self.original_size, mode="bilinear", align_corners=False) + align_corners=False, + ) + masks = masks[..., : self.input_size[0], : self.input_size[1]] + masks = F.interpolate(masks, self.original_size, mode="bilinear", align_corners=False) masks = masks > mask_threshold return masks @@ -108,28 +111,26 @@ class SamPromptProcessor(BaseTextProcessor): """ def __init__( - self, - size: int=1024, - **kwargs, ): + self, + size: int = 1024, + **kwargs, + ): super().__init__(**kwargs) self.size = size - def apply_coords(self, coords: np.ndarray, - original_size: Tuple[int, ...]) -> np.ndarray: + def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: """ Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format. 
""" old_h, old_w = original_size - new_h, new_w = get_preprocess_shape(original_size[0], original_size[1], - self.size) + new_h, new_w = get_preprocess_shape(original_size[0], original_size[1], self.size) coords = deepcopy(coords).astype(float) coords[..., 0] = coords[..., 0] * (new_w / old_w) coords[..., 1] = coords[..., 1] * (new_h / old_h) return coords - def apply_boxes(self, boxes: np.ndarray, - original_size: Tuple[int, ...]) -> np.ndarray: + def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: """ Expects a numpy array shape Bx4. Requires the original image size in (H, W) format. @@ -138,17 +139,23 @@ def apply_boxes(self, boxes: np.ndarray, return boxes.reshape([-1, 4]) def __call__( - self, - original_size, - point_coords=None, - point_labels=None, - box=None, - **kwargs, ): - coords_paddle, labels_paddle, box_paddle, mask_input_paddle = ( - None, + self, + original_size, + point_coords=None, + point_labels=None, + box=None, + **kwargs, + ): + # coords_paddle, labels_paddle, box_paddle, mask_input_paddle = ( + # None, + # None, + # None, + # None, + # ) + coords_paddle, box_paddle = ( None, None, - None, ) + ) if point_coords is not None: point_coords = self.apply_coords(point_coords, original_size) coords_paddle = paddle.to_tensor(point_coords).cast("float32") @@ -171,22 +178,22 @@ class SamImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - size: List[int]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - image_format: str="RGB", - original_size: List[int]=None, - input_size: List[int]=None, - **kwargs, ) -> None: + self, + size: List[int] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + image_format: str = "RGB", + original_size: List[int] = None, + input_size: List[int] = None, + **kwargs, + ) -> None: super().__init__(**kwargs) size = size if size is not None else 1024 self.size = size self.image_format = image_format - self.image_mean = (image_mean if image_mean is not None else - IMAGENET_STANDARD_MEAN) + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.original_size = original_size @@ -196,19 +203,19 @@ def apply_image(self, image: np.ndarray) -> np.ndarray: """ Expects a numpy array with shape HxWxC in uint8 format. """ - target_size = get_preprocess_shape(image.shape[0], image.shape[1], - self.size) + target_size = get_preprocess_shape(image.shape[0], image.shape[1], self.size) return np.array(resize(to_pil_image(image), target_size)) def preprocess( - self, - images, - size: Optional[Dict[str, int]]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - image_format: str="RGB", - **kwargs, ): + self, + images, + size: Optional[Dict[str, int]] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + image_format: str = "RGB", + **kwargs, + ): """ Preprocess an image or batch of images. 
@@ -221,13 +228,11 @@ def preprocess( if not isinstance(images, (list, tuple)): images = [images] - if isinstance(images[0], str): - images = [load_image(image) for image in images] + # if isinstance(images[0], str): + # images = [load_image(image) for image in images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") assert image_format in [ "RGB", @@ -248,9 +253,8 @@ def preprocess( self.input_size = tuple(input_image_paddle.shape[-2:]) input_image_paddle = ( - input_image_paddle - paddle.to_tensor(self.image_mean).reshape( - [-1, 1, 1])) / paddle.to_tensor(self.image_std).reshape( - [-1, 1, 1]) + input_image_paddle - paddle.to_tensor(self.image_mean).reshape([-1, 1, 1]) + ) / paddle.to_tensor(self.image_std).reshape([-1, 1, 1]) # Pad h, w = input_image_paddle.shape[-2:] diff --git a/paddlemix/processors/tokenizer.py b/paddlemix/processors/tokenizer.py index 31c9b01b37444..fab9519a091b3 100644 --- a/paddlemix/processors/tokenizer.py +++ b/paddlemix/processors/tokenizer.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle + """ CLIP tokenizer Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. @@ -31,9 +32,7 @@ @lru_cache() def default_bpe(): - return os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "bpe_simple_vocab_16e6.txt.gz") + return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") @lru_cache() @@ -47,9 +46,9 @@ def bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. 
""" - bs = (list(range(ord("!"), ord("~") + 1)) + - list(range(ord("¡"), ord("¬") + 1)) + - list(range(ord("®"), ord("ÿ") + 1))) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 for b in range(2**8): @@ -87,11 +86,11 @@ def whitespace_clean(text): class SimpleTokenizer(object): - def __init__(self, bpe_path: str=default_bpe(), special_tokens=None): + def __init__(self, bpe_path: str = default_bpe(), special_tokens=None): self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") - merges = merges[1:49152 - 256 - 2 + 1] + merges = merges[1 : 49152 - 256 - 2 + 1] """Class Method: *.split, not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*, and convert manually""" merges = [tuple(merge.split()) for merge in merges] vocab = list(bytes_to_unicode().values()) @@ -101,8 +100,7 @@ def __init__(self, bpe_path: str=default_bpe(), special_tokens=None): if not special_tokens: special_tokens = ["", ""] else: - special_tokens = ["", "" - ] + special_tokens + special_tokens = ["", ""] + special_tokens vocab.extend(special_tokens) self.encoder = dict(zip(vocab, range(len(vocab)))) self.decoder = {v: k for k, v in self.encoder.items()} @@ -110,22 +108,21 @@ def __init__(self, bpe_path: str=default_bpe(), special_tokens=None): self.cache = {t: t for t in special_tokens} special = "|".join(special_tokens) self.pat = re.compile( - special + - "|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", - re.IGNORECASE, ) + special + "|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", + re.IGNORECASE, + ) self.vocab_size = len(self.encoder) self.all_special_ids = [self.encoder[t] for t in special_tokens] def bpe(self, token): if token in self.cache: return self.cache[token] - word = tuple(token[:-1]) + (token[-1] + "", ) + word = tuple(token[:-1]) + (token[-1] + "",) pairs = get_pairs(word) if not pairs: return token + "" while True: - bigram = min( - pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -139,8 +136,7 @@ def bpe(self, token): except: new_word.extend(word[i:]) break - if word[i] == first and i < len(word) - 1 and word[i + - 1] == second: + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: new_word.append(first + second) i += 2 else: @@ -162,30 +158,26 @@ def encode(self, text1): for token in re.findall(self.pat, text): token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) """Class Method: *.split, not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*, and convert manually""" - bpe_tokens.extend(self.encoder[bpe_token] - for bpe_token in self.bpe(token).split(" ")) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def decode(self, tokens): text = "".join([self.decoder[token] for token in tokens]) - text = (bytearray([self.byte_decoder[c] for c in text]).decode( - "utf-8", errors="replace").replace("", " ")) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors="replace").replace("", " ") return text def __call__(self, text, max_length=77, return_tensors=True, **kwargs): texts = text sot_token = self.encoder[""] eot_token = self.encoder[""] - all_tokens = 
[([sot_token] + _tokenizer.encode(text) + [eot_token]) - for text in texts] + all_tokens = [([sot_token] + _tokenizer.encode(text) + [eot_token]) for text in texts] if return_tensors: - result = paddle.zeros( - shape=[len(all_tokens), max_length], dtype="int64") + result = paddle.zeros(shape=[len(all_tokens), max_length], dtype="int64") for i, tokens in enumerate(all_tokens): if len(tokens) > max_length: tokens = tokens[:max_length] tokens[-1] = eot_token - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) + result[(i), : len(tokens)] = paddle.to_tensor(data=tokens) return {"input_ids": result} else: result = [] @@ -204,8 +196,7 @@ def from_pretrained(cls, *args, **kwargs): _tokenizer = SimpleTokenizer() -def tokenize(texts: Union[str, List[str]], - context_length: int=77) -> paddle.Tensor: +def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> paddle.Tensor: """ Returns the tokenized representation of given input string(s) @@ -224,15 +215,13 @@ def tokenize(texts: Union[str, List[str]], texts = [texts] sot_token = _tokenizer.encoder[""] eot_token = _tokenizer.encoder[""] - all_tokens = [([sot_token] + _tokenizer.encode(text) + [eot_token]) - for text in texts] - result = paddle.zeros( - shape=[len(all_tokens), context_length], dtype="int64") + all_tokens = [([sot_token] + _tokenizer.encode(text) + [eot_token]) for text in texts] + result = paddle.zeros(shape=[len(all_tokens), context_length], dtype="int64") for i, tokens in enumerate(all_tokens): if len(tokens) > context_length: tokens = tokens[:context_length] tokens[-1] = eot_token - result[(i), :len(tokens)] = paddle.to_tensor(data=tokens) + result[(i), : len(tokens)] = paddle.to_tensor(data=tokens) return result @@ -244,8 +233,7 @@ def __init__(self, tokenizer_name: str): self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - def __call__(self, texts: Union[str, List[str]], - context_length: int=77) -> paddle.Tensor: + def __call__(self, texts: Union[str, List[str]], context_length: int = 77) -> paddle.Tensor: if isinstance(texts, str): texts = [texts] texts = [whitespace_clean(basic_clean(text)) for text in texts] @@ -254,5 +242,6 @@ def __call__(self, texts: Union[str, List[str]], return_tensors="pt", max_length=context_length, padding="max_length", - truncation=True, ).input_ids + truncation=True, + ).input_ids return input_ids diff --git a/paddlemix/processors/utils.py b/paddlemix/processors/utils.py index 29ae623096eff..53fe7051435f1 100644 --- a/paddlemix/processors/utils.py +++ b/paddlemix/processors/utils.py @@ -28,7 +28,6 @@ def _missing_(cls, value): def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in enumerate(sublist): diff --git a/paddlemix/processors/visualglm_image_processing.py b/paddlemix/processors/visualglm_image_processing.py index 2ed3464393e07..7e0afc9ec6ef6 100644 --- a/paddlemix/processors/visualglm_image_processing.py +++ b/paddlemix/processors/visualglm_image_processing.py @@ -21,14 +21,26 @@ import PIL from paddlenlp.transformers.tokenizer_utils_base import TensorType -from .image_processing_utils import (BaseImageProcessor, BatchFeature, - get_size_dict) -from .image_transforms import (convert_to_rgb, normalize, rescale, resize, - to_channel_dimension_format) -from .image_utils import (ChannelDimension, ImageInput, PILImageResampling, - is_batched, to_numpy_array, valid_images) - -__all__ = ["VisualGLMImageProcessor", ] +from .image_processing_utils import BaseImageProcessor, 
BatchFeature, get_size_dict +from .image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) + +__all__ = [ + "VisualGLMImageProcessor", +] class VisualGLMImageProcessor(BaseImageProcessor): @@ -69,17 +81,18 @@ class VisualGLMImageProcessor(BaseImageProcessor): model_input_names = ["pixel_values"] def __init__( - self, - do_resize: bool=True, - size: Dict[str, int]=None, - resample: PILImageResampling=PILImageResampling.BICUBIC, - do_rescale: bool=True, - rescale_factor: Union[int, float]=1 / 255, - do_normalize: bool=True, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - do_convert_rgb: bool=True, - **kwargs, ) -> None: + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: super().__init__(**kwargs) default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] @@ -97,12 +110,13 @@ def __init__( self.do_convert_rgb = do_convert_rgb def resize( - self, - image: np.ndarray, - size: Dict[str, int], - resample: PILImageResampling=PILImageResampling.BICUBIC, - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Resize an image. @@ -127,14 +141,16 @@ def resize( size=output_size, resample=resample, data_format=data_format, - **kwargs, ) + **kwargs, + ) def rescale( - self, - image: np.ndarray, - scale: Union[int, float], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ): + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): """ Rescale an image by a scale factor. image = image * scale. @@ -149,12 +165,13 @@ def rescale( return rescale(image, scale=scale, data_format=data_format, **kwargs) def normalize( - self, - image: np.ndarray, - mean: Union[float, List[float]], - std: Union[float, List[float]], - data_format: Optional[Union[str, ChannelDimension]]=None, - **kwargs, ) -> np.ndarray: + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -168,24 +185,24 @@ def normalize( data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. 
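To close, the MiniGPT4 and VisualGLM image processors touched in this patch apply the same rescale, normalize, and channels-first pipeline with the CLIP defaults shown above. A minimal NumPy sketch of that pipeline; the input shape and sample value are illustrative only:

import numpy as np

image = np.full((224, 224, 3), 128, dtype=np.uint8)   # HWC uint8 input
mean = np.array([0.48145466, 0.4578275, 0.40821073])  # default_image_mean
std = np.array([0.26862954, 0.26130258, 0.27577711])  # default_image_std

pixels = image.astype(np.float32) * (1 / 255)         # do_rescale
pixels = (pixels - mean) / std                        # do_normalize
pixels = pixels.transpose(2, 0, 1)                    # ChannelDimension.FIRST
print(pixels.shape)                                   # (3, 224, 224)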
""" - return normalize( - image, mean=mean, std=std, data_format=data_format, **kwargs) + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool]=None, - size: Optional[Dict[str, int]]=None, - resample: PILImageResampling=None, - do_rescale: Optional[bool]=None, - rescale_factor: Optional[float]=None, - do_normalize: Optional[bool]=None, - image_mean: Optional[Union[float, List[float]]]=None, - image_std: Optional[Union[float, List[float]]]=None, - return_tensors: Optional[Union[str, TensorType]]=None, - do_convert_rgb: bool=None, - data_format: ChannelDimension=ChannelDimension.FIRST, - **kwargs, ) -> PIL.Image.Image: + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -227,13 +244,11 @@ def preprocess( do_resize = do_resize if do_resize is not None else self.do_resize resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale - rescale_factor = (rescale_factor if rescale_factor is not None else - self.rescale_factor) + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_convert_rgb = (do_convert_rgb if do_convert_rgb is not None else - self.do_convert_rgb) + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) @@ -242,21 +257,16 @@ def preprocess( images = [images] if not valid_images(images): - raise ValueError( - "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " - "paddle.Tensor.") + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") if do_resize and size is None or resample is None: - raise ValueError( - "Size and resample must be specified if do_resize is True.") + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: - raise ValueError( - "Rescale factor must be specified if do_rescale is True.") + raise ValueError("Rescale factor must be specified if do_rescale is True.") if do_normalize and (image_mean is None or image_std is None): - raise ValueError( - "Image mean and std must be specified if do_normalize is True.") + raise ValueError("Image mean and std must be specified if do_normalize is True.") # PIL RGBA images are converted to RGB if do_convert_rgb: @@ -266,28 +276,15 @@ def preprocess( images = [to_numpy_array(image) for image in images] if do_resize: - images = [ - self.resize( - image=image, size=size, resample=resample) - for image in images - ] + images = [self.resize(image=image, size=size, resample=resample) for image in images] if do_rescale: - images = [ - self.rescale( - image=image, scale=rescale_factor) for image in images - ] + images = [self.rescale(image=image, scale=rescale_factor) for image in images] if do_normalize: - images = [ - self.normalize( - image=image, mean=image_mean, std=image_std) - for image in images - ] - - images = [ - to_channel_dimension_format(image, data_format) for image in images - ] + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlemix/processors/visualglm_processing.py b/paddlemix/processors/visualglm_processing.py index 5295186a7eab0..e26d1302e45ae 100644 --- a/paddlemix/processors/visualglm_processing.py +++ b/paddlemix/processors/visualglm_processing.py @@ -21,15 +21,20 @@ import numpy as np import paddle -from paddlenlp.transformers.tokenizer_utils_base import (BatchEncoding, - TensorType, TextInput) +from paddlenlp.transformers.tokenizer_utils_base import ( + BatchEncoding, + TensorType, + TextInput, +) from PIL import Image from .base_processing import ProcessorMixin from .image_processing_utils import BatchFeature from .image_utils import ImageInput -__all__ = ["VisualGLMProcessor", ] +__all__ = [ + "VisualGLMProcessor", +] class VisualGLMProcessor(ProcessorMixin): @@ -78,10 +83,11 @@ def __init__(self, image_processor, tokenizer): self.num_query_tokens = 32 def process_images( - self, - images: ImageInput, - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchFeature: + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: """ This method uses [`VisualGLMImageProcessor.__call__`] method to prepare image(s) for the model. Please refer to the docstring of the method for more information. 
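Note on the preprocess() refactor above: it only reflows the code, the order of operations is unchanged: optional RGB conversion, conversion to a numpy array, resize, rescale, normalize, then channel-first layout for "pixel_values". A minimal standalone sketch of that order, assuming a 224x224 target size and the default mean/std from __init__ (hypothetical helper, not the class itself):

import numpy as np
from PIL import Image

def preprocess_one(image: Image.Image, size=(224, 224), scale=1 / 255):
    # Same order as VisualGLMImageProcessor.preprocess: convert_to_rgb ->
    # to_numpy_array -> resize -> rescale -> normalize -> channel-first.
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype="float32")
    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype="float32")
    image = image.convert("RGB").resize(size, Image.BICUBIC)
    array = np.asarray(image).astype("float32") * scale
    array = (array - mean) / std
    return array.transpose(2, 0, 1)  # HWC -> CHW, the "pixel_values" layout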
@@ -92,31 +98,31 @@ def process_images( if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): images = [images] - processed_images = self.image_processor( - images, return_tensors=return_tensors) + processed_images = self.image_processor(images, return_tensors=return_tensors) return processed_images def process_texts( - self, - texts: Union[TextInput, List[TextInput]], - return_tensors: Optional[Union[str, TensorType]]=TensorType.PADDLE, - **kwargs, ) -> BatchEncoding: + self, + texts: Union[TextInput, List[TextInput]], + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchEncoding: if not texts: raise ValueError("You have to input correct texts.") if isinstance(texts, TextInput): texts = [texts] - processed_texts = self.tokenizer( - text=texts, return_tensors=return_tensors, **kwargs) + processed_texts = self.tokenizer(text=texts, return_tensors=return_tensors, **kwargs) return BatchEncoding(processed_texts) def build_inputs_with_image( - self, - image: Union[Image.Image, np.ndarray, paddle.Tensor], - query: str, - history: Optional[str]=None, ): + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = None, + ): # construct prompt with inputs if image is not None: prompt = self.default_prompt @@ -129,22 +135,17 @@ def build_inputs_with_image( if image is not None: image_start_position = prompt.rfind(self.image_tag) image_end_position = image_start_position + len(self.image_tag) - first_text_input = self.tokenizer.encode( - prompt[:image_start_position], add_special_tokens=False) + first_text_input = self.tokenizer.encode(prompt[:image_start_position], add_special_tokens=False) image_input = [self.tokenizer.unk_token_id] * self.num_query_tokens - second_text_input = self.tokenizer.encode( - prompt[image_end_position:], add_special_tokens=False) - all_input_ids = (first_text_input["input_ids"] + image_input + - second_text_input["input_ids"]) - all_input_ids = self.tokenizer.build_inputs_with_special_tokens( - all_input_ids) + second_text_input = self.tokenizer.encode(prompt[image_end_position:], add_special_tokens=False) + all_input_ids = first_text_input["input_ids"] + image_input + second_text_input["input_ids"] + all_input_ids = self.tokenizer.build_inputs_with_special_tokens(all_input_ids) # processing image processed_image = self.process_images(image) inputs = { - "input_ids": paddle.to_tensor( - all_input_ids, dtype="int64").unsqueeze(0), + "input_ids": paddle.to_tensor(all_input_ids, dtype="int64").unsqueeze(0), "pre_image_length": len(first_text_input["input_ids"]), "pixel_values": processed_image["pixel_values"], } @@ -155,23 +156,24 @@ def build_inputs_with_image( return inputs def __call__( - self, - image: Union[Image.Image, np.ndarray, paddle.Tensor], - query: str, - history: Optional[str]=[], - **kwargs, ): + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[str] = [], + **kwargs, + ): if image is None: raise ValueError("Image should not be None.") if query is None: raise ValueError("Query should not be None.") if not isinstance(query, str): - raise TypeError( - "A string type of query is expected, but acceived {}.".format( - type(query))) + raise TypeError("A string type of query is expected, but acceived {}.".format(type(query))) if not isinstance(history, list): raise TypeError( - "A list type of history is expected with each item [query, response] in it, but acceived {}.". 
- format(type(history))) + "A list type of history is expected with each item [query, response] in it, but acceived {}.".format( + type(history) + ) + ) inputs = self.build_inputs_with_image(image, query, history=history) @@ -203,10 +205,8 @@ def process_response(self, response): ["\?", "?"], ] for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], - r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], - r"%s\1" % item[1], response) + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) return response def get_responses(self, *args, **kwargs): @@ -223,5 +223,4 @@ def get_responses(self, *args, **kwargs): def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names - return list( - dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/paddlemix/trainer/blip2_trainer.py b/paddlemix/trainer/blip2_trainer.py index 708236f547e58..b3b9a6a831e4e 100644 --- a/paddlemix/trainer/blip2_trainer.py +++ b/paddlemix/trainer/blip2_trainer.py @@ -11,39 +11,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import paddlemix -from paddlenlp.trainer.trainer import Trainer -from paddlemix.optimization import FilterParamsName -from paddlemix.examples.blip2.utils import coco_caption_eval - import contextlib import inspect -import math +import json import os import sys import time -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np import paddle import paddle.amp.auto_cast as autocast import paddle.nn as nn from paddle.distributed import fleet -from paddle.io import DataLoader, Dataset, DistributedBatchSampler - +from paddle.io import DataLoader, Dataset +from paddlenlp.trainer.trainer import Trainer +from paddlenlp.trainer.trainer_callback import DefaultFlowCallback, ProgressCallback +from paddlenlp.trainer.trainer_utils import ( # set_hyrbid_parallel_seed, + EvalLoopOutput, + IterableDatasetShard, + ShardingOption, + has_length, + speed_metrics, +) from paddlenlp.transformers.model_utils import unwrap_model from paddlenlp.utils import device_guard -from paddlenlp.utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler from paddlenlp.utils.import_utils import is_datasets_available from paddlenlp.utils.log import logger -from paddlenlp.trainer.trainer_callback import ( - DefaultFlowCallback, - ProgressCallback, ) -from paddlenlp.trainer.trainer_utils import ( # set_hyrbid_parallel_seed, - EvalLoopOutput, EvalPrediction, IterableDatasetShard, ShardingOption, - find_batch_size, has_length, speed_metrics, ) -import json -from paddlemix.examples.blip2.utils import save_result, VQA, VQAEval + +import paddlemix +from paddlemix.examples.blip2.utils import VQA, VQAEval, coco_caption_eval, save_result +from paddlemix.optimization import FilterParamsName DEFAULT_CALLBACKS = [DefaultFlowCallback] DEFAULT_PROGRESS_CALLBACK = ProgressCallback @@ -74,9 +71,7 @@ def paddlenlp_load(path, return_numpy=False): def is_dp_group_support_in_group_sharded_parallel(): - return "dp_group" in set( - 
inspect.signature(paddle.distributed.sharding.group_sharded_parallel) - .parameters.keys()) + return "dp_group" in set(inspect.signature(paddle.distributed.sharding.group_sharded_parallel).parameters.keys()) __all__ = ["BLIP2Trainer"] @@ -93,21 +88,21 @@ class BLIP2Trainer(Trainer): """ - from paddlenlp.trainer.trainer_utils import log_metrics, metrics_format, save_metrics, save_state + from paddlenlp.trainer.trainer_utils import ( + log_metrics, + metrics_format, + save_metrics, + save_state, + ) - def __init__(self, - processor=None, - eval_processor=None, - eval_collator=None, - **kwargs): + def __init__(self, processor=None, eval_processor=None, eval_collator=None, **kwargs): super().__init__(**kwargs) self.processor = processor self.eval_processor = eval_processor self.eval_collator = eval_collator def create_optimizer_and_scheduler(self, num_training_steps: int): - self.lr_scheduler = self.create_scheduler(num_training_steps // - self.args.num_train_epochs) + self.lr_scheduler = self.create_scheduler(num_training_steps // self.args.num_train_epochs) param_filter = FilterParamsName() p_wd, p_non_wd = param_filter(self.model) self.optimizer = paddle.optimizer.AdamW( @@ -116,22 +111,22 @@ def create_optimizer_and_scheduler(self, num_training_steps: int): weight_decay=float(self.args.weight_decay), beta1=self.args.adam_beta1, beta2=self.args.adam_beta2, - apply_decay_param_fun=param_filter._apply_decay_param_fun, ) + apply_decay_param_fun=param_filter._apply_decay_param_fun, + ) def create_scheduler(self, num_training_steps): - lr_sched_func = getattr(paddlemix.optimization, - self.args.lr_scheduler_name) + lr_sched_func = getattr(paddlemix.optimization, self.args.lr_scheduler_name) lr_sched = lr_sched_func( learning_rate=self.args.learning_rate, epochs=self.args.num_train_epochs, warmup_start_lr=self.args.warmup_start_lr, eta_min=self.args.eta_min, warmup_steps=self.args.warmup_steps, - step_each_epoch=num_training_steps, ) + step_each_epoch=num_training_steps, + ) return lr_sched - def get_eval_dataloader(self, - eval_dataset: Optional[Dataset]=None) -> DataLoader: + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: """ Returns the evaluation [`~paddle.io.DataLoader`]. 
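Note on create_optimizer_and_scheduler above: FilterParamsName is wired into AdamW through apply_decay_param_fun, which Paddle calls with each parameter name to decide whether weight decay is applied. A rough sketch of that pattern with an assumed name-based rule (the real criteria live in paddlemix.optimization.FilterParamsName):

import paddle

def apply_decay(param_name: str) -> bool:
    # Hypothetical rule: skip weight decay for biases and norm parameters.
    return not any(key in param_name for key in ("bias", "norm"))

model = paddle.nn.Linear(8, 8)  # stand-in model for the sketch
optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),
    weight_decay=0.05,
    apply_decay_param_fun=apply_decay,
)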
@@ -146,10 +141,8 @@ def get_eval_dataloader(self, raise ValueError("Trainer: evaluation requires an eval_dataset.") eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset - if is_datasets_available() and isinstance(eval_dataset, - datasets.Dataset): - eval_dataset = self._remove_unused_columns( - eval_dataset, description="evaluation") + if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): + eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") if self._is_iterable_dataset(eval_dataset): if self.args.dataset_world_size > 1: @@ -158,13 +151,15 @@ def get_eval_dataloader(self, batch_size=self.args.per_device_eval_batch_size, drop_last=self.args.dataloader_drop_last, num_processes=self.args.dataset_world_size, - process_index=self.args.dataset_rank, ) + process_index=self.args.dataset_rank, + ) return DataLoader( eval_dataset, batch_size=self.args.per_device_eval_batch_size, collate_fn=self.eval_collator, - num_workers=self.args.dataloader_num_workers, ) + num_workers=self.args.dataloader_num_workers, + ) eval_sampler = self._get_eval_sampler(eval_dataset) @@ -172,7 +167,8 @@ def get_eval_dataloader(self, eval_dataset, batch_sampler=eval_sampler, collate_fn=self.eval_collator, - num_workers=self.args.dataloader_num_workers, ) + num_workers=self.args.dataloader_num_workers, + ) def _wrap_model(self, model, training=True): @@ -190,15 +186,11 @@ def _wrap_model(self, model, training=True): # model, self.optimizer if hasattr(model, "language_model"): decorated = paddle.amp.decorate( - models=[model.visual_encoder, model.language_model], - optimizers=self.optimizer, - level="O2") + models=[model.visual_encoder, model.language_model], optimizers=self.optimizer, level="O2" + ) model.visual_encoder, model.language_model = decorated[0] else: - decorated = paddle.amp.decorate( - models=[model.visual_encoder], - optimizers=self.optimizer, - level="O2") + decorated = paddle.amp.decorate(models=[model.visual_encoder], optimizers=self.optimizer, level="O2") model.visual_encoder = decorated[0][0] self.optimizer.set_state_dict(decorated[1].state_dict()) @@ -206,34 +198,32 @@ def _wrap_model(self, model, training=True): if self.args.world_size > 1 and not self.args.use_hybrid_parallel: model = paddle.DataParallel(model) assert self.args.tensor_parallel_degree < 2, "tensor_parallel_degree = {}, pelease init optimizer.".format( - self.args.tensor_parallel_degree) + self.args.tensor_parallel_degree + ) + in_pipeline_parallel_mode = self.args.pipeline_parallel_degree > 1 in_sharding_parallel_mode = self.sharding is not None in_tensor_parallel_model = self.args.tensor_parallel_degree > 1 if in_pipeline_parallel_mode: if self.args.amp_master_grad: - mix_precision_utils.MixPrecisionLayer( - model, dtype=self.amp_dtype) # return value has no use + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use # hack for pipeline model mini batch to batch # need batter solution @ZHUI # make batch_fn compatible for fleet.distributed_model decorate. 
prepare_pipeline_inputs_func = ( - model._prepare_pipeline_inputs_func - if hasattr(model, "_prepare_pipeline_inputs_func") else None) + model._prepare_pipeline_inputs_func if hasattr(model, "_prepare_pipeline_inputs_func") else None + ) model = fleet.distributed_model(model) if prepare_pipeline_inputs_func is not None: model._prepare_pipeline_inputs_func = prepare_pipeline_inputs_func else: def _prepare_pipeline_inputs_func(inputs): - first_stage_keys = [ - "input_ids", "attention_mask", "position_ids" - ] + first_stage_keys = ["input_ids", "attention_mask", "position_ids"] last_stage_keys = ["labels"] def get_expected_keys(inputs, keys): - ret = tuple( - [inputs.pop(k) for k in keys if k in inputs]) + ret = tuple([inputs.pop(k) for k in keys if k in inputs]) if len(ret) == 1: ret = ret[0] return ret @@ -245,10 +235,7 @@ def get_expected_keys(inputs, keys): ] keys = list(inputs[0].keys()) - inputs_batch = { - key: [data.pop(key) for data in inputs] - for key in keys - } + inputs_batch = {key: [data.pop(key) for data in inputs] for key in keys} return [ get_expected_keys(inputs_batch, first_stage_keys), get_expected_keys(inputs_batch, last_stage_keys), @@ -261,8 +248,7 @@ def get_expected_keys(inputs, keys): assert self.optimizer is not None, "Pipeline mode need decorate optimizer, pelease init optimizer." if self.args.amp_master_grad: - self.optimizer = mix_precision_utils.MixPrecisionOptimizer( - self.optimizer) + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) self.optimizer = fleet.distributed_optimizer(self.optimizer) # No pipeline mode, sharding only @@ -271,19 +257,17 @@ def get_expected_keys(inputs, keys): if self.args.tensor_parallel_degree > 1: hcg = fleet.get_hybrid_communicate_group() assert ( - ShardingOption.SHARD_GRAD_OP in self.args.sharding or - ShardingOption.SHARD_OP in self.args.sharding + ShardingOption.SHARD_GRAD_OP in self.args.sharding or ShardingOption.SHARD_OP in self.args.sharding ), "Only support tensor parallel + sharding stage1/stage2 hybrid parallel now." - model = paddle.distributed.fleet.meta_parallel.TensorParallel( - model, hcg, strategy=None) + model = paddle.distributed.fleet.meta_parallel.TensorParallel(model, hcg, strategy=None) if ShardingOption.SHARD_OP in self.args.sharding: model = fleet.distributed_model(model) self.optimizer = fleet.distributed_optimizer(self.optimizer) else: # sync params (broadcast) buffers in dp group - if not is_dp_group_support_in_group_sharded_parallel( - ) and self.args.data_parallel_degree > 1: + + if not is_dp_group_support_in_group_sharded_parallel() and self.args.data_parallel_degree > 1: try: from paddle.fluid.dygraph.parallel import sync_params_buffers except ImportError: @@ -292,8 +276,7 @@ def get_expected_keys(inputs, keys): hcg = fleet.get_hybrid_communicate_group() dp_group = hcg.get_data_parallel_group() - sync_params_buffers( - model, comm_group=dp_group, src_rank=dp_group.ranks[0]) + sync_params_buffers(model, comm_group=dp_group, src_rank=dp_group.ranks[0]) cpu_offload = ShardingOption.OFFLOAD in self.args.sharding assert self.optimizer is not None, "optimizer is empty!" @@ -319,20 +302,19 @@ def get_expected_keys(inputs, keys): scaler=None, group=self.sharding_group, offload=cpu_offload, - **extra_kwargs, ) + **extra_kwargs, + ) self.optimizer = optimizer # pure tesnor parallel mode, no pipeline_parallel, no sharding. 
if not in_pipeline_parallel_mode and not in_sharding_parallel_mode and in_tensor_parallel_model: if self.args.amp_master_grad: - mix_precision_utils.MixPrecisionLayer( - model, dtype=self.amp_dtype) # return value has no use + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use model = fleet.distributed_model(model) assert self.optimizer is not None, "Tensor parallel mode need decorate optimizer, pelease init optimizer." if self.args.amp_master_grad: - self.optimizer = mix_precision_utils.MixPrecisionOptimizer( - self.optimizer) + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) self.optimizer = fleet.distributed_optimizer(self.optimizer) return model @@ -342,18 +324,21 @@ def autocast_smart_context_manager(self): arguments, depending on the situation. """ if self.enable_autocast_context_manager: - ctx_manager = autocast(True, ) + ctx_manager = autocast( + True, + ) else: - ctx_manager = contextlib.nullcontext() if sys.version_info >= ( - 3, 7) else contextlib.suppress() + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager - def evaluate(self, - eval_dataset: Optional[Dataset]=None, - ignore_keys: Optional[List[str]]=None, - metric_key_prefix: str="eval", - task_name="coco_caption") -> Dict[str, float]: + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + task_name="coco_caption", + ) -> Dict[str, float]: """ Run evaluation and returns metrics. @@ -382,7 +367,7 @@ def evaluate(self, self._memory_tracker.start() self.task_name = task_name if isinstance(eval_dataset, dict): - eval_dataset = eval_dataset['test'] + eval_dataset = eval_dataset["test"] eval_dataloader = self.get_eval_dataloader(eval_dataset) start_time = time.time() @@ -390,30 +375,34 @@ def evaluate(self, eval_dataloader, description="Evaluation", ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, ) + metric_key_prefix=metric_key_prefix, + ) - total_batch_size = self.args.eval_batch_size * self.args.dataset_world_size - output.metrics.update(speed_metrics( - metric_key_prefix, - start_time, )) + # total_batch_size = self.args.eval_batch_size * self.args.dataset_world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + ) + ) self.log(output.metrics) - self.control = self.callback_handler.on_evaluate( - self.args, self.state, self.control, output.metrics) + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) self._memory_tracker.stop_and_update_metrics(output.metrics) return output.metrics def evaluation_loop( - self, - dataloader: DataLoader, - description: str, - prediction_loss_only: Optional[bool]=None, - ignore_keys: Optional[List[str]]=None, - metric_key_prefix: str="eval", - max_eval_iters: Optional[int]=-1, ) -> EvalLoopOutput: + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_eval_iters: Optional[int] = -1, + ) -> EvalLoopOutput: """ Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. 
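Note on the fallback _prepare_pipeline_inputs_func reformatted above: it turns a list of per-sample dicts into one dict of lists, then splits the keys between the first and last pipeline stages. Roughly, as a standalone sketch of the same regrouping:

def regroup_micro_batch(samples, first_stage_keys, last_stage_keys):
    # samples: list of dicts, e.g. [{"input_ids": ..., "labels": ...}, ...]
    keys = list(samples[0].keys())
    batch = {key: [sample.pop(key) for sample in samples] for key in keys}

    def take(wanted):
        ret = tuple(batch.pop(k) for k in wanted if k in batch)
        return ret[0] if len(ret) == 1 else ret

    return [take(first_stage_keys), take(last_stage_keys)]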
@@ -427,9 +416,7 @@ def evaluation_loop( if isinstance(dataloader, paddle.io.DataLoader): batch_size = dataloader.batch_sampler.batch_size - elif isinstance( - dataloader, - paddle.fluid.dataloader.dataloader_iter._DataLoaderIterBase): + elif isinstance(dataloader, paddle.fluid.dataloader.dataloader_iter._DataLoaderIterBase): # support for inner dataloader batch_size = dataloader._batch_sampler.batch_size # alias for inner dataloader @@ -450,8 +437,7 @@ def evaluation_loop( logger.info(f" Total prediction steps = {max_eval_iters}") logger.info(f" Pre device batch size = {batch_size}") - logger.info( - f" Total Batch size = {batch_size * self.args.dataset_world_size}") + logger.info(f" Total Batch size = {batch_size * self.args.dataset_world_size}") model.eval() @@ -464,8 +450,7 @@ def evaluation_loop( # Prediction step eval_output = self.prediction_step(model, inputs) results.extend(eval_output) - self.control = self.callback_handler.on_prediction_step( - args, self.state, self.control) + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) if max_eval_iters > 0 and step >= max_eval_iters - 1: break if results is not None: @@ -473,15 +458,13 @@ def evaluation_loop( else: metrics = None - return EvalLoopOutput( - predictions=None, label_ids=None, metrics=metrics, num_samples=None) + return EvalLoopOutput(predictions=None, label_ids=None, metrics=metrics, num_samples=None) def prediction_step( - self, - model: nn.Layer, - inputs: Dict[str, Union[paddle.Tensor, Any]], ) -> Tuple[Optional[ - paddle.Tensor], Optional[paddle.Tensor], Optional[ - paddle.Tensor]]: + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: """ Perform an evaluation step on `model` using `inputs`. 
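For orientation, evaluation_loop above reduces to a simple shape: detect the batch size, put the model in eval mode, run prediction_step over the dataloader, collect the raw results, and stop early when max_eval_iters is set; metrics are only computed afterwards in after_evaluation. A compressed sketch of that control flow (not the trainer API itself):

def run_eval_loop(model, dataloader, prediction_step, max_eval_iters=-1):
    model.eval()
    results = []
    for step, batch in enumerate(dataloader):
        results.extend(prediction_step(model, batch))  # list of dicts per batch
        if max_eval_iters > 0 and step >= max_eval_iters - 1:
            break
    return results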
@@ -505,27 +488,23 @@ def prediction_step( with paddle.no_grad(): # with paddle.amp.auto_cast(level='O2'): model_inputs = self.eval_processor( - text=[""] * inputs['pixel_values'].shape[0], + text=[""] * inputs["pixel_values"].shape[0], return_tensors="pd", return_attention_mask=True, - mode="test", ) + mode="test", + ) model_inputs.update(inputs) generated_ids, scores = model.generate(**model_inputs) - generated_text = self.processor.batch_decode( - generated_ids, skip_special_tokens=True) + generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True) generated_text = [text.strip() for text in generated_text] - for caption, img_id in zip(generated_text, inputs['image_id']): - results.append({ - "caption": caption, - "image_id": int(img_id) - }) + for caption, img_id in zip(generated_text, inputs["image_id"]): + results.append({"caption": caption, "image_id": int(img_id)}) elif "vqa" in self.task_name: with paddle.no_grad(): # with paddle.amp.auto_cast(level='O2'): model_inputs = inputs generated_ids, scores = model.predict_answers(**model_inputs) - answers = self.processor.batch_decode( - generated_ids, skip_special_tokens=True) + answers = self.processor.batch_decode(generated_ids, skip_special_tokens=True) answers = [text.strip() for text in answers] question_id = inputs["question_id"] for answer, ques_id in zip(answers, question_id): @@ -540,21 +519,21 @@ def after_evaluation(self, val_result): eval_result_file = save_result( result=val_result, result_dir=self.args.output_dir + self.task_name + "/result", - filename="{}_epoch{}".format('eval', 'eval'), + filename="{}_epoch{}".format("eval", "eval"), remove_duplicate="image_id", - world_size=self.args.world_size) + world_size=self.args.world_size, + ) - metrics = self._report_metrics_caption( - eval_result_file=eval_result_file) + metrics = self._report_metrics_caption(eval_result_file=eval_result_file) elif "vqa" in self.task_name: eval_result_file = save_result( val_result, result_dir=self.args.output_dir + self.task_name + "/result", - filename="{}_epoch{}".format('eval', 'eval'), - remove_duplicate="question_id", ) + filename="{}_epoch{}".format("eval", "eval"), + remove_duplicate="question_id", + ) - metrics = self._report_metrics_vqa( - eval_result_file=eval_result_file) + metrics = self._report_metrics_vqa(eval_result_file=eval_result_file) else: raise NotImplementedError return metrics @@ -562,7 +541,7 @@ def after_evaluation(self, val_result): def _report_metrics_caption(self, eval_result_file, split_name="test"): # TODO better way to define this - coco_gt_root = os.path.join('/root/.paddlemix/datasets/', "coco_gt") + coco_gt_root = os.path.join("/root/.paddlemix/datasets/", "coco_gt") coco_val = coco_caption_eval(coco_gt_root, eval_result_file, split_name) agg_metrics = coco_val.eval["CIDEr"] + coco_val.eval["Bleu_4"] @@ -579,12 +558,11 @@ def _report_metrics_caption(self, eval_result_file, split_name="test"): def _report_metrics_vqa(self, eval_result_file): metrics = {} - self.anno_files = '/root/.paddlemix/datasets/coco/annotations/v2_mscoco_val2014_annotations.json' - self.ques_files = '/root/.paddlemix/datasets/coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json' + self.anno_files = "/root/.paddlemix/datasets/coco/annotations/v2_mscoco_val2014_annotations.json" + self.ques_files = "/root/.paddlemix/datasets/coco/annotations/v2_OpenEnded_mscoco_val2014_questions.json" vqa = VQA(self.anno_files, self.ques_files) - vqa_result = vqa.loadRes( - resFile=eval_result_file, quesFile=self.ques_files) 
+ vqa_result = vqa.loadRes(resFile=eval_result_file, quesFile=self.ques_files) vqa_scorer = VQAEval(vqa, vqa_result, n=2) logger.info("Start VQA evaluation.") vqa_scorer.evaluate() @@ -597,12 +575,10 @@ def _report_metrics_vqa(self, eval_result_file): logger.info("Per Answer Type Accuracy is the following:") for ans_type in vqa_scorer.accuracy["perAnswerType"]: - logger.info( - "%s : %.02f" % - (ans_type, vqa_scorer.accuracy["perAnswerType"][ans_type])) + logger.info("%s : %.02f" % (ans_type, vqa_scorer.accuracy["perAnswerType"][ans_type])) metrics[ans_type] = vqa_scorer.accuracy["perAnswerType"][ans_type] with open(os.path.join(self.args.output_dir, "evaluate.txt"), "a") as f: f.write(json.dumps(metrics) + "\n") - return metrics \ No newline at end of file + return metrics diff --git a/paddlemix/trainer/trainer.py b/paddlemix/trainer/trainer.py index 8b2e44bd86de8..25570303f64bc 100644 --- a/paddlemix/trainer/trainer.py +++ b/paddlemix/trainer/trainer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np + import paddle from paddle.io import DataLoader from paddlenlp.trainer.trainer import Trainer @@ -80,10 +80,8 @@ def training_step(self, model, inputs) -> paddle.Tensor: if self.rank == 0 and self.args.tensorboard: self.logstep += 1 self.writer.add_scalar("train/loss", loss.item(), self.logstep) - self.writer.add_scalar("train/grad_norm", - grad_norms.item(), self.logstep) - self.writer.add_scalar("train/logit_scale", - logit_scale.item(), self.logstep) + self.writer.add_scalar("train/grad_norm", grad_norms.item(), self.logstep) + self.writer.add_scalar("train/logit_scale", logit_scale.item(), self.logstep) return loss.detach() @@ -103,4 +101,5 @@ def get_train_dataloader(self): collate_fn=self.data_collator, num_workers=self.args.dataloader_num_workers, prefetch_factor=1, - shuffle=False, ) + shuffle=False, + ) diff --git a/paddlemix/utils/downloader.py b/paddlemix/utils/downloader.py index 08f4139832d1c..f1659ec9225fa 100644 --- a/paddlemix/utils/downloader.py +++ b/paddlemix/utils/downloader.py @@ -30,15 +30,18 @@ from huggingface_hub.utils import EntryNotFoundError from tqdm.auto import tqdm -from .env import (DOWNLOAD_SERVER, FAILED_STATUS, HF_CACHE_HOME, MODEL_HOME, - SUCCESS_STATUS) +from .env import ( + DOWNLOAD_SERVER, + FAILED_STATUS, + HF_CACHE_HOME, + MODEL_HOME, + SUCCESS_STATUS, +) from .log import logger __all__ = ["get_weights_path_from_url", "resolve_cache_dir"] -COMMUNITY_MODEL_PREFIX = os.getenv( - "COMMUNITY_MODEL_PREFIX", - "https://bj.bcebos.com/paddlenlp/models/community") +COMMUNITY_MODEL_PREFIX = os.getenv("COMMUNITY_MODEL_PREFIX", "https://bj.bcebos.com/paddlenlp/models/community") WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") DOWNLOAD_RETRY_LIMIT = 3 DOWNLOAD_CHECK = False @@ -111,11 +114,12 @@ def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): def get_path_from_url_with_filelock( - url: str, - root_dir: str, - md5sum: Optional[str]=None, - check_exist: bool=True, - timeout: float=-1, ) -> str: + url: str, + root_dir: str, + md5sum: Optional[str] = None, + check_exist: bool = True, + timeout: float = -1, +) -> str: """construct `get_path_from_url` for `model_utils` to enable downloading multiprocess-safe Args: @@ -140,8 +144,7 @@ def get_path_from_url_with_filelock( os.makedirs(os.path.dirname(lock_file_path), exist_ok=True) with FileLock(lock_file_path, 
timeout=timeout): - result = get_path_from_url( - url=url, root_dir=root_dir, md5sum=md5sum, check_exist=check_exist) + result = get_path_from_url(url=url, root_dir=root_dir, md5sum=md5sum, check_exist=check_exist) return result @@ -161,15 +164,13 @@ def _download(url, path, md5sum=None): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 else: - raise RuntimeError("Download from {} failed. " - "Retry limit reached".format(url)) + raise RuntimeError("Download from {} failed. " "Retry limit reached".format(url)) logger.info("Downloading {} from {}".format(fname, url)) req = requests.get(url, stream=True) if req.status_code != 200: - raise RuntimeError("Downloading from {} failed with code " - "{}!".format(url, req.status_code)) + raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) # For protecting download interupted, download to # tmp_fullname firstly, move tmp_fullname to fullname @@ -178,11 +179,7 @@ def _download(url, path, md5sum=None): total_size = req.headers.get("content-length") with open(tmp_fullname, "wb") as f: if total_size: - with tqdm( - total=int(total_size), - unit="B", - unit_scale=True, - unit_divisor=1024) as pbar: + with tqdm(total=int(total_size), unit="B", unit_scale=True, unit_divisor=1024) as pbar: for chunk in req.iter_content(chunk_size=1024): f.write(chunk) pbar.update(len(chunk)) @@ -207,8 +204,7 @@ def _md5check(fullname, md5sum=None): calc_md5sum = md5.hexdigest() if calc_md5sum != md5sum: - logger.info("File {} md5 check failed, {}(calc) != " - "{}(base)".format(fullname, calc_md5sum, md5sum)) + logger.info("File {} md5 check failed, {}(calc) != " "{}(base)".format(fullname, calc_md5sum, md5sum)) return False return True @@ -425,10 +421,11 @@ def url_file_exists(url: str) -> bool: def hf_file_exists( - repo_id: str, - filename: str, - token: Union[bool, str, None]=None, - subfolder: Optional[str]=None, ) -> bool: + repo_id: str, + filename: str, + token: Union[bool, str, None] = None, + subfolder: Optional[str] = None, +) -> bool: """Check whether the HF file exists Args: @@ -447,16 +444,18 @@ def hf_file_exists( try: _ = get_hf_file_metadata( url=url, - token=token, ) + token=token, + ) return True except EntryNotFoundError: return False def resolve_cache_dir( - pretrained_model_name_or_path: str, - from_hf_hub: bool, - cache_dir: Optional[str]=None, ) -> str: + pretrained_model_name_or_path: str, + from_hf_hub: bool, + cache_dir: Optional[str] = None, +) -> str: """resolve cache dir for PretrainedModel and PretrainedConfig Args: diff --git a/paddlemix/utils/env.py b/paddlemix/utils/env.py index 83a2f55c9cac0..0251734197348 100644 --- a/paddlemix/utils/env.py +++ b/paddlemix/utils/env.py @@ -39,9 +39,7 @@ def _get_ppmix_home(): if os.path.isdir(home_path): return home_path else: - raise RuntimeError( - "The environment variable PPMIX_HOME {} is not a directory.". 
- format(home_path)) + raise RuntimeError("The environment variable PPMIX_HOME {} is not a directory.".format(home_path)) else: return home_path return os.path.join(_get_user_home(), ".paddlemix") @@ -108,8 +106,8 @@ def set_hyrbid_parallel_seed(basic_seed, data_world_rank, mp_rank, pp_rank=0): def setdistenv(args): if dist.get_world_size() > 1: args.dp_degree = dist.get_world_size() // ( - args.tensor_parallel_degree * args.sharding_parallel_degree * - args.pipeline_parallel_degree) + args.tensor_parallel_degree * args.sharding_parallel_degree * args.pipeline_parallel_degree + ) strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { "dp_degree": args.dp_degree, @@ -131,10 +129,10 @@ def setdistenv(args): args.dp_rank = hcg.get_data_parallel_rank() args.sharding_rank = hcg.get_sharding_parallel_rank() - args.data_world_rank = ( - args.dp_rank * args.sharding_parallel_degree + args.sharding_rank) + args.data_world_rank = args.dp_rank * args.sharding_parallel_degree + args.sharding_rank args.data_world_size = dist.get_world_size() // abs( - args.tensor_parallel_degree * args.pipeline_parallel_degree) + args.tensor_parallel_degree * args.pipeline_parallel_degree + ) else: args.data_world_rank = 0 args.data_world_size = 1 diff --git a/paddlemix/utils/initializer.py b/paddlemix/utils/initializer.py index 82777c8ad1f88..8a13c739977bb 100644 --- a/paddlemix/utils/initializer.py +++ b/paddlemix/utils/initializer.py @@ -136,9 +136,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False): Tuple[fan_in, fan_out] """ if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") if reverse: num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] @@ -191,8 +189,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False): mode = mode.lower() valid_modes = ["fan_in", "fan_out"] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) @@ -218,13 +215,11 @@ def _calculate_gain(nonlinearity, param=None): elif nonlinearity == "leaky_relu": if param is None: negative_slope = 0.01 - elif (not isinstance(param, bool) and isinstance(param, int) or - isinstance(param, float)): + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): # True/False are instances of int, hence check above negative_slope = param else: - raise ValueError("negative_slope {} not a valid number".format( - param)) + raise ValueError("negative_slope {} not a valid number".format(param)) return math.sqrt(2.0 / (1 + negative_slope**2)) elif nonlinearity == "selu": return 3.0 / 4 @@ -232,11 +227,7 @@ def _calculate_gain(nonlinearity, param=None): raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) -def kaiming_uniform_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_uniform method Args: @@ -254,11 +245,7 @@ def kaiming_uniform_(tensor, return _no_grad_uniform_(tensor, -k, k) -def kaiming_normal_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_normal_(tensor, a=0, 
mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_normal_ Args: @@ -306,8 +293,7 @@ def reset_initialized_parameter(model, include_self=True): """ for _, m in model.named_sublayers(include_self=include_self): if isinstance(m, nn.Conv2D): - k = float(m._groups) / (m._in_channels * m._kernel_size[0] * - m._kernel_size[1]) + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) k = math.sqrt(k) _no_grad_uniform_(m.weight, -k, k) if hasattr(m, "bias") and getattr(m, "bias") is not None: @@ -343,13 +329,11 @@ def _transform(t, device, dtype, blocking): size_dtype = core.size_of_dtype(dtype) # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space. # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. - waiting_alloc_memory = ( - (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + waiting_alloc_memory = ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 gpu_memory_available = core.gpu_memory_available() if gpu_memory_available < waiting_alloc_memory: # Copy param / Tensor to cpu - t_used = t._copy_to(paddle.CPUPlace(), - blocking) # k-v type will error + t_used = t._copy_to(paddle.CPUPlace(), blocking) # k-v type will error # Release mem of t t.value().get_tensor()._clear() else: @@ -379,11 +363,12 @@ def _transform(t, device, dtype, blocking): def to( - self, - device=None, - dtype=None, - blocking=None, - floating_only=True, ): + self, + device=None, + dtype=None, + blocking=None, + floating_only=True, +): """ Cast the parameters and buffers of Layer by the give device, dtype and blocking. @@ -411,24 +396,25 @@ def to( if isinstance(device, str): device = paddle.device._convert_to_place(device) elif isinstance( - device, + device, ( core.CPUPlace, core.CUDAPlace, core.CUDAPinnedPlace, - core.XPUPlace, ), ): + core.XPUPlace, + ), + ): pass else: raise ValueError( "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " - + type(device).__name__) + + type(device).__name__ + ) if blocking is None: blocking = True else: - assert isinstance( - blocking, - bool), "blocking value error, must be the True, False or None" + assert isinstance(blocking, bool), "blocking value error, must be the True, False or None" def transform(t, device, dtype, blocking): if floating_only and (not paddle.is_floating_point(t)): diff --git a/paddlemix/utils/log.py b/paddlemix/utils/log.py index b951c826b6321..97f7a407c70dd 100644 --- a/paddlemix/utils/log.py +++ b/paddlemix/utils/log.py @@ -23,34 +23,13 @@ loggers = {} log_config = { - "DEBUG": { - "level": 10, - "color": "purple" - }, - "INFO": { - "level": 20, - "color": "green" - }, - "TRAIN": { - "level": 21, - "color": "cyan" - }, - "EVAL": { - "level": 22, - "color": "blue" - }, - "WARNING": { - "level": 30, - "color": "yellow" - }, - "ERROR": { - "level": 40, - "color": "red" - }, - "CRITICAL": { - "level": 50, - "color": "bold_red" - }, + "DEBUG": {"level": 10, "color": "purple"}, + "INFO": {"level": 20, "color": "green"}, + "TRAIN": {"level": 21, "color": "cyan"}, + "EVAL": {"level": 22, "color": "blue"}, + "WARNING": {"level": 30, "color": "yellow"}, + "ERROR": {"level": 40, "color": "red"}, + "CRITICAL": {"level": 50, "color": "bold_red"}, } @@ -62,22 +41,19 @@ class Logger(object): name(str) : Logger name, default is 'PaddleNLP' """ - def __init__(self, name: 
str=None): + def __init__(self, name: str = None): name = "PaddleMIX" if not name else name self.logger = logging.getLogger(name) for key, conf in log_config.items(): logging.addLevelName(conf["level"], key) self.__dict__[key] = functools.partial(self.__call__, conf["level"]) - self.__dict__[key.lower()] = functools.partial(self.__call__, - conf["level"]) + self.__dict__[key.lower()] = functools.partial(self.__call__, conf["level"]) self.format = colorlog.ColoredFormatter( "%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s", - log_colors={ - key: conf["color"] - for key, conf in log_config.items() - }, ) + log_colors={key: conf["color"] for key, conf in log_config.items()}, + ) self.handler = logging.StreamHandler() self.handler.setFormatter(self.format) @@ -95,8 +71,7 @@ def enable(self): self._is_enable = True def set_level(self, log_level: str): - assert (log_level in log_config - ), f"Invalid log level. Choose among {log_config.keys()}" + assert log_level in log_config, f"Invalid log level. Choose among {log_config.keys()}" self.logger.setLevel(log_level) @property @@ -117,7 +92,7 @@ def use_terminator(self, terminator: str): self.handler.terminator = old_terminator @contextlib.contextmanager - def processing(self, msg: str, interval: float=0.1): + def processing(self, msg: str, interval: float = 0.1): """ Continuously print a progress bar with rotating special effects. diff --git a/paddlemix/utils/parameters.py b/paddlemix/utils/parameters.py index 1e34840804b4c..e6a68aa0010db 100644 --- a/paddlemix/utils/parameters.py +++ b/paddlemix/utils/parameters.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import paddle @@ -19,20 +20,19 @@ def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): param_shape = p.shape # Allow CPU/GPU and float16/float32 transfer # NOTE: str(p.place) differs between paddle develop and 2.2 - if str(p.dtype)[-len(dtype):] == dtype and ("gpu" in str(p.place).lower() or - "cuda" in str(p.place).lower()): + if str(p.dtype)[-len(dtype) :] == dtype and ("gpu" in str(p.place).lower() or "cuda" in str(p.place).lower()): return p if restore_data: - if (getattr(paddle.fluid.framework, "_in_eager_mode_", False) and - getattr(paddle.fluid.framework, "_dygraph_tracer_", None) is - not None) or (hasattr(paddle.fluid.framework, "global_var") and - getattr(paddle.fluid.framework.global_var, - "_in_eager_mode_", False) and - getattr(paddle.fluid.framework.global_var, - "_dygraph_tracer_", None) is not None): + if ( + getattr(paddle.fluid.framework, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework, "_dygraph_tracer_", None) is not None + ) or ( + hasattr(paddle.fluid.framework, "global_var") + and getattr(paddle.fluid.framework.global_var, "_in_eager_mode_", False) + and getattr(paddle.fluid.framework.global_var, "_dygraph_tracer_", None) is not None + ): param_data = p.numpy() - new_p = paddle.create_parameter( - shape=param_shape, dtype=dtype, is_bias=is_bias) + new_p = paddle.create_parameter(shape=param_shape, dtype=dtype, is_bias=is_bias) new_p.set_value(param_data.astype(dtype)) return new_p elif paddle.in_dynamic_mode(): @@ -42,16 +42,13 @@ def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): # elaborately to get a ParamBase. Also note `VarBase.set_value` # enforce the same dtype and can not be used directly. 
new_p = type(p)(shape=param_shape, dtype=dtype, is_bias=is_bias) - new_p.value().get_tensor().set( - param_data.astype(dtype), - paddle.framework._current_expected_place()) + new_p.value().get_tensor().set(param_data.astype(dtype), paddle.framework._current_expected_place()) return new_p else: - param_data = np.array(paddle.static.global_scope().find_var(p.name) - .get_tensor()) + param_data = np.array(paddle.static.global_scope().find_var(p.name).get_tensor()) return paddle.create_parameter( shape=param_shape, dtype=dtype, is_bias=is_bias, - default_initializer=paddle.nn.initializer.Assign(param_data) - if restore_data else None, ) + default_initializer=paddle.nn.initializer.Assign(param_data) if restore_data else None, + ) diff --git a/paddlevlp/datasets/dataset.py b/paddlevlp/datasets/dataset.py deleted file mode 100644 index 96452fb68de78..0000000000000 --- a/paddlevlp/datasets/dataset.py +++ /dev/null @@ -1,1136 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import atexit -import inspect -import os -import time -import warnings -from collections import namedtuple -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast - -import datasets -from multiprocess import Pool, RLock -from PIL import Image - -import paddlemix - -try: - import paddle.distributed as dist -except Exception: - warnings.warn("paddle.distributed is not contains in you paddle!") - -import importlib -from functools import partial - -from paddle.io import Dataset, IterableDataset -from paddle.utils.download import _get_unique_endpoints - -from paddlemix.utils.env import DATA_HOME - -__all__ = ["MapDataset", "DatasetBuilder", "IterDataset", "load_dataset"] - -DATASETS_MODULE_PATH = "paddlemix.datasets." 
- -# Patch for intranet -from datasets import load_dataset as origin_load_dataset # noqa: E402 - - -def load_from_ppvlp(path, *args, **kwargs): - ppvlp_path = paddlemix.datasets.__path__[0] - new_path = os.path.split(path)[-1] - new_path = os.path.join(ppvlp_path, "hf_datasets", new_path + ".py") - if os.path.exists(new_path): - return origin_load_dataset(new_path, *args, **kwargs) - else: - return origin_load_dataset(path, *args, **kwargs) - - -datasets.load_dataset = load_from_ppvlp - - -class DatasetTuple: - def __init__(self, splits): - self.identifier_map, identifiers = self._gen_identifier_map(splits) - self.tuple_cls = namedtuple("datasets", identifiers) - self.tuple = self.tuple_cls(* [None for _ in splits]) - - def __getitem__(self, key): - if isinstance(key, (int, slice)): - return self.tuple[key] - if isinstance(key, str): - return getattr(self.tuple, self.identifier_map[key]) - - def __setitem__(self, key, value): - self.tuple = self.tuple._replace(**{self.identifier_map[key]: value}) - - def _gen_identifier_map(self, splits): - identifier_map = {} - identifiers = [] - for i in range(len(splits)): - identifiers.append("splits_" + str(i)) - identifier_map[splits[i]] = "splits_" + str(i) - return identifier_map, identifiers - - def __len__(self): - return len(self.tuple) - - -def import_main_class(module_path): - """ - Import a module at module_path and return its DatasetBuilder class. - - """ - module_path = DATASETS_MODULE_PATH + module_path - module = importlib.import_module(module_path) - main_cls_type = DatasetBuilder - - # Find the main class in our imported module - module_main_cls = None - for name, obj in module.__dict__.items(): - if isinstance(obj, type) and issubclass(obj, main_cls_type): - if name == "DatasetBuilder": - continue - module_main_cls = obj - break - - return module_main_cls - - -def load_from_hf(path, name=None, splits=None, **kwargs): - from datasets import DatasetDict - from datasets import load_dataset as load_hf_dataset - from datasets.features import ClassLabel - - try: - hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) - except FileNotFoundError: - raise FileNotFoundError("Couldn't find the dataset script for '" + path - + "' on PaddleNLP or HuggingFace") - else: - label_list = [] - if isinstance(hf_datasets, DatasetDict): - datasets = DatasetTuple(list(hf_datasets.keys())) - for split, ds in hf_datasets.items(): - for feature in ds.features.values(): - if isinstance(feature, ClassLabel): - label_list = feature.names - datasets[split] = MapDataset(ds, label_list=label_list) - elif isinstance(hf_datasets, list): - datasets = DatasetTuple(splits) - for i, split in enumerate(splits): - for feature in hf_datasets[i].features.values(): - if isinstance(feature, ClassLabel): - label_list = feature.names - datasets[split] = MapDataset( - hf_datasets[i], label_list=label_list) - else: - for feature in hf_datasets.features.values(): - if isinstance(feature, ClassLabel): - label_list = feature.names - datasets = MapDataset(hf_datasets, label_list=label_list) - return datasets - - -def load_dataset(path_or_read_func, - name=None, - data_files=None, - splits=None, - lazy=None, - **kwargs): - """ - This method will load a dataset, either form PaddleNLP library or from a - self-defined data loading script, by calling functions in `DatasetBuilder`. - - For all the names of datasets in PaddleNLP library, see here: `dataset_list - `__. - - Either `splits` or `data_files` must be specified. 
- - Args: - path_or_read_func (str|callable): Name of the dataset processing script - in PaddleNLP library or a custom data reading function. - name (str, optional): Additional name to select a more specific dataset. - Defaults to None. - data_files (str|list|tuple|dict, optional): Defining the path of dataset - files. If None. `splits` must be specified. Defaults to None. - splits (str|list|tuple, optional): Which split of the data to load. If None. - `data_files` must be specified. Defaults to None. - lazy (bool, optional): Weather to return `MapDataset` or an `IterDataset`. - True for `IterDataset`. False for `MapDataset`. If None, return the - default type of this dataset. Defaults to None. - kwargs (dict): Other keyword arguments to be passed to the `DatasetBuilder`. - - Returns: - A `MapDataset` or `IterDataset` or a tuple of those. - - For how to use this function, please see `dataset_load - `__ - and `dataset_self_defined - `__ - - """ - if inspect.isfunction(path_or_read_func): - assert lazy is not None, "lazy can not be None in custom mode." - kwargs["name"] = name - kwargs["data_files"] = data_files - kwargs["splits"] = splits - custom_kwargs = {} - for name in inspect.signature(path_or_read_func).parameters.keys(): - if name in kwargs.keys(): - custom_kwargs[name] = kwargs[name] - - reader_instance = SimpleBuilder(lazy=lazy, read_func=path_or_read_func) - return reader_instance.read(**custom_kwargs) - else: - try: - reader_cls = import_main_class(path_or_read_func) - except ModuleNotFoundError: - datasets = load_from_hf( - path_or_read_func, name=name, splits=splits, **kwargs) - else: - reader_instance = reader_cls(lazy=lazy, name=name, **kwargs) - - # Check if selected name and split is valid in this DatasetBuilder - if hasattr(reader_instance, "BUILDER_CONFIGS"): - if name in reader_cls.BUILDER_CONFIGS.keys(): - split_names = reader_cls.BUILDER_CONFIGS[name][ - "splits"].keys() - else: - raise ValueError( - 'Invalid name "{}". Should be one of {}.'.format( - name, list(reader_cls.BUILDER_CONFIGS.keys()))) - elif hasattr(reader_instance, "SPLITS"): - split_names = reader_instance.SPLITS.keys() - else: - raise AttributeError( - "Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder." - ) - - selected_splits = [] - if isinstance(splits, list) or isinstance(splits, tuple): - selected_splits.extend(splits) - else: - selected_splits += [splits] - - for split_name in selected_splits: - if split_name not in split_names and split_name is not None: - raise ValueError('Invalid split "{}". Should be one of {}.'. - format(split_name, list(split_names))) - - datasets = reader_instance.read_datasets( - data_files=data_files, splits=splits) - return datasets - - -class MapDataset(Dataset): - """ - Wraps a map-style dataset-like object as an instance of `MapDataset`, and equips it - with `map` and other utility methods. All non-magic methods of the raw object - are also accessible. - - Args: - data (list|Dataset): An object with `__getitem__` and `__len__` methods. It could - be a list or a subclass of `paddle.io.Dataset`. - kwargs (dict, optional): Other information to be passed to the dataset. - - For examples of this class, please see `dataset_self_defined - `__. 
- - """ - - def __init__(self, data, **kwargs): - self.data = data - self._transform_pipline = [] - self.new_data = self.data - self.info = kwargs - self.label_list = self.info.pop("label_list", None) - self.vocab_info = self.info.pop("vocab_info", None) - - def _transform(self, data): - for fn in self._transform_pipline: - data = fn(data) - return data - - def __getitem__(self, idx): - """ - Basic function of `MapDataset` to get sample from dataset with a given - index. - """ - return (self._transform(self.new_data[idx]) - if self._transform_pipline else self.new_data[idx]) - - def __len__(self): - """ - Returns the number of samples in dataset. - """ - return len(self.new_data) - - def filter(self, fn, num_workers=0): - """ - Filters samples by the filter function and uses the filtered data to - update this dataset. - - Args: - fn (callable): A filter function that takes a sample as input and - returns a boolean. Samples that return False would be discarded. - num_workers(int, optional): Number of processes for multiprocessing. If - set to 0, it doesn't use multiprocessing. Defaults to `0`. - """ - assert num_workers >= 0, "num_workers should be a non-negative value" - if num_workers > 1: - shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) - ] - kwds_per_shard = [ - dict( - self=shards[rank], fn=fn) for rank in range(num_workers) - ] - pool = Pool(num_workers, initargs=(RLock(), )) - - results = [ - pool.apply_async( - self.__class__._filter, kwds=kwds) - for kwds in kwds_per_shard - ] - transformed_shards = [r.get() for r in results] - - pool.close() - pool.join() - self.new_data = [] - for i in range(num_workers): - self.new_data += transformed_shards[i].new_data - return self - else: - return self._filter(fn) - - def _filter(self, fn): - self.new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if fn(self.new_data[idx]) - ] - return self - - def shard(self, num_shards=None, index=None, contiguous=False): - self.new_data = self._shard( - num_shards=num_shards, index=index, contiguous=contiguous).data - return self - - def _shard(self, num_shards=None, index=None, contiguous=False): - """ - Split the dataset into `num_shards` pieces. Note that the size of each - shard might be different because the original dataset may not be evenly - divisible. - - Args: - num_shards (int, optional): An integer representing the number of - data shards. If None, `num_shards` would be number of trainers. - Defaults to `None`. - index (int, optional): An integer representing the index of the - current shard. If None, `index` would be the current trainer rank - id. Defaults to `None`. - contiguous: (bool, optional): If true, contiguous chunks of data - will be select for sharding. And total number of examples will - be the same. Otherwise each shard will contain all examples of - dataset whose index mod `num_shards` = `index`. Defaults to `False`. 
- """ - if num_shards is None: - num_shards = dist.get_world_size() - if index is None: - index = dist.get_rank() - - if contiguous: - div = len(self) // num_shards - mod = len(self) % num_shards - start = div * index + min(index, mod) - end = start + div + (1 if index < mod else 0) - new_data = [self.new_data[idx] for idx in range(start, end)] - else: - new_data = [ - self.new_data[idx] for idx in range(len(self.new_data)) - if idx % num_shards == index - ] - - return MapDataset(new_data) - - def map(self, fn, lazy=True, batched=False, num_workers=0): - """ - Performs specific function on the dataset to transform and update every sample. - - Args: - fn (callable): Transformations to be performed. It receives single - sample as argument if batched is False. Else it receives all examples. - lazy (bool, optional): If True, transformations would be delayed and - performed on demand. Otherwise, transforms all samples at once. Note that - if `fn` is stochastic, `lazy` should be True or you will get the same - result on all epochs. Defaults to False. - batched(bool, optional): If True, transformations would take all examples as - input and return a collection of transformed examples. Note that if set - True, `lazy` option would be ignored. Defaults to False. - num_workers(int, optional): Number of processes for multiprocessing. If - set to 0, it doesn't use multiprocessing. Note that if set to positive - value, `lazy` option would be ignored. Defaults to 0. - """ - - assert num_workers >= 0, "num_workers should be a non-negative value" - if num_workers > 1: - shards = [ - self._shard( - num_shards=num_workers, index=index, contiguous=True) - for index in range(num_workers) - ] - kwds_per_shard = [ - dict( - self=shards[rank], fn=fn, lazy=False, batched=batched) - for rank in range(num_workers) - ] - pool = Pool(num_workers, initargs=(RLock(), )) - results = [ - pool.apply_async( - self.__class__._map, kwds=kwds) for kwds in kwds_per_shard - ] - transformed_shards = [r.get() for r in results] - pool.close() - pool.join() - self.new_data = [] - for i in range(num_workers): - self.new_data += transformed_shards[i].new_data - return self - else: - return self._map(fn, lazy=lazy, batched=batched) - - def _map(self, fn, lazy=True, batched=False): - if batched: - self.new_data = fn(self.new_data) - elif lazy: - self._transform_pipline.append(fn) - else: - self.new_data = [ - fn(self.new_data[idx]) for idx in range(len(self.new_data)) - ] - return self - - -class IterDataset(IterableDataset): - """ - Wraps a dataset-like object as an instance of `IterDataset`, and equips it with - `map` and other utility methods. All non-magic methods of the raw object - also accessible. - - Args: - data (Iterable): An object with `__iter__` function. It can be a Iterable or a - subclass of `paddle.io.IterableDataset`. - kwargs (dict, optional): Other information to be passed to the dataset. - - For examples of this class, please see `dataset_self_defined - `__. - """ - - def __init__(self, data, **kwargs): - self.data = data - self._transform_pipline = [] - self._filter_pipline = [] - - self.label_list = kwargs.pop("label_list", None) - self.vocab_info = kwargs.pop("vocab_info", None) - - def _transform(self, data): - for fn in self._transform_pipline: - data = fn(data) - return data - - def _shard_filter(self, num_samples): - return True - - def _filter(self, data): - for fn in self._filter_pipline: - if not fn(data): - return False - return True - - def __iter__(self): - """ - yields sample sequentially. 
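A hedged sketch of the `map` semantics documented above, assuming the `MapDataset` class defined in this file; the toy examples are made up.

ds = MapDataset([{"text": "a b"}, {"text": "c d e"}])

# lazy=True only records the transform; it runs when an item is accessed.
ds.map(lambda ex: {**ex, "n_tokens": len(ex["text"].split())}, lazy=True)
print(ds[0])   # {'text': 'a b', 'n_tokens': 2}

# batched=True hands the whole example list to the function at once
# (and the `lazy` option is ignored in that case).
ds.map(lambda batch: [{**ex, "upper": ex["text"].upper()} for ex in batch],
       batched=True)
print(ds[1])   # the recorded lazy transform is still applied on access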
- """ - num_samples = 0 - if inspect.isfunction(self.data): - for example in self.data(): - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example - num_samples += 1 - else: - if inspect.isgenerator(self.data): - warnings.warn( - "Reciving generator as data source, data can only be iterated once" - ) - for example in self.data: - if (not self._filter_pipline or - self._filter(self._filter_pipline) - ) and self._shard_filter(num_samples=num_samples): - yield self._transform( - example) if self._transform_pipline else example - num_samples += 1 - - def filter(self, fn): - """ - Filters samples by the filter function and uses the filtered data to - update this dataset. - - Args: - fn (callable): A filter function that takes a sample as input and - returns a boolean. Samples that return False are discarded. - """ - - self._filter_pipline.append(fn) - - return self - - def shard(self, num_shards=None, index=None): - """ - Split the dataset into `num_shards` pieces. - - Args: - num_shards (int, optional): An integer representing the number of - data shards. If None, `num_shards` would be number of trainers. - Defaults to None. - index (int, optional): An integer representing the index of the - current shard. If None, `index` would be the current trainer rank - id. Defaults to None. - """ - if num_shards is None: - num_shards = dist.get_world_size() - if index is None: - index = dist.get_rank() - - def sharder(num_shards, index, num_samples): - if num_samples % num_shards == index: - return True - else: - return False - - fn = partial(sharder, num_shards=num_shards, index=index) - self._shard_filter = fn - return self - - def map(self, fn): - """ - Performs specific function on the dataset to transform and update every sample. - - Args: - fn (callable): Transformations to be performed. It receives single - sample as argument. - """ - - self._transform_pipline.append(fn) - - return self - - -class DatasetBuilder: - """ - A base class for all DatasetBuilder. It provides a `read()` function to turn - a data file into a MapDataset or IterDataset. - - `_get_data()` function and `_read()` function should be implemented to download - data file and read data file into a `Iterable` of the examples. - - For how to define a custom `DatasetBuilder`, please see `contribute_dataset - `__. - """ - - lazy = False - - def __init__(self, lazy=None, name=None, **config): - if lazy is not None: - self.lazy = lazy - self.name = name - self.config = config - - def read_datasets(self, splits=None, data_files=None): - def remove_if_exit(filepath): - if isinstance(filepath, (list, tuple)): - for file in filepath: - try: - os.remove(file) - except OSError: - pass - else: - try: - os.remove(filepath) - except OSError: - pass - - if data_files is None: - if splits is None: - splits = (list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) - if hasattr(self, "BUILDER_CONFIGS") else - list(self.SPLITS.keys())) - - assert ( - isinstance(splits, str) or - (isinstance(splits, list) and isinstance(splits[0], str)) or - (isinstance(splits, tuple) and isinstance(splits[0], str)) - ), "`splits` should be a string or list of string or a tuple of string." 
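A hedged sketch of the `DatasetBuilder` contract described above: `_get_data` resolves a split name to a file path and `_read` yields example dicts. The class name, file paths, and label names are illustrative assumptions.

class TinyClassification(DatasetBuilder):
    SPLITS = {"train": "train.tsv", "dev": "dev.tsv"}

    def _get_data(self, mode):
        # A real builder would download and cache the file here; this sketch
        # assumes the files already exist locally.
        return self.SPLITS[mode]

    def _read(self, filename, *args):
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                text, label = line.rstrip("\n").split("\t")
                yield {"text": text, "label": label}

    def get_labels(self):
        return ["negative", "positive"]

# train_ds = TinyClassification(lazy=False).read_datasets(splits="train")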
- - if isinstance(splits, str): - splits = [splits] - datasets = DatasetTuple(splits) - parallel_env = dist.ParallelEnv() - unique_endpoints = _get_unique_endpoints( - parallel_env.trainer_endpoints[:]) - # move register hook to first and register togather - lock_files = [] - for split in splits: - lock_file = os.path.join(DATA_HOME, self.__class__.__name__) - if self.name is not None: - lock_file = lock_file + "." + self.name - lock_file += "." + split + ".done" + "." + str(os.getppid()) - lock_files.append(lock_file) - # Must register to all procs to make the lock file can be removed - # when any proc breaks. Otherwise, the single registered proc may - # not receive proper singal send by the parent proc to exit. - atexit.register(lambda: remove_if_exit(lock_files)) - for split in splits: - filename = self._get_data(split) - lock_file = os.path.join(DATA_HOME, self.__class__.__name__) - if self.name is not None: - lock_file = lock_file + "." + self.name - lock_file += "." + split + ".done" + "." + str(os.getppid()) - # `lock_file` indicates the finished status of`_get_data`. - # `_get_data` only works in the `unique_endpoints` specified - # proc since `get_path_from_url` only work for it. The other - # procs wait `_get_data` to be finished. - if parallel_env.current_endpoint in unique_endpoints: - f = open(lock_file, "w") - f.close() - else: - while not os.path.exists(lock_file): - time.sleep(1) - datasets[split] = self.read(filename=filename, split=split) - else: - assert ( - isinstance(data_files, str) or isinstance(data_files, tuple) or - isinstance(data_files, list) - ), "`data_files` should be a string or tuple or list of strings." - if isinstance(data_files, str): - data_files = [data_files] - default_split = "train" - if splits: - if isinstance(splits, str): - splits = [splits] - datasets = DatasetTuple(splits) - assert len(splits) == len( - data_files - ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of loacl data file." - for i in range(len(data_files)): - datasets[splits[i]] = self.read( - filename=data_files[i], split=splits[i]) - else: - datasets = DatasetTuple( - ["split" + str(i) for i in range(len(data_files))]) - for i in range(len(data_files)): - datasets["split" + str(i)] = self.read( - filename=data_files[i], split=default_split) - - return datasets if len(datasets) > 1 else datasets[0] - - def read(self, filename, split="train"): - """ - Returns a dataset containing all the examples that can be read from the file path. - - If `self.lazy` is False, this eagerly reads all instances from `self._read()` - and returns a `MapDataset`. - - If `self.lazy` is True, this returns an `IterDataset`, which internally - relies on the generator created from `self._read()` to lazily produce examples. - In this case your implementation of `_read()` must also be lazy - (that is, not load all examples into memory at once). - - Args: - filename (str): Path of data file to read, usually provided by `_get_data` - function. - split (str, optional): The split name of selected dataset. This only makes - a different when data files of different splits have different structures. - - Returns: - A `MapDataset|IterDataset`. - """ - - label_list = self.get_labels() - vocab_info = self.get_vocab() - - def _create_dict(labels): - # For multiple labels in the form of list. 
- if isinstance(labels[0], list) or isinstance(labels[0], tuple): - label_dict = [] - for sub_labels in labels: - sub_dict = {} - for i, label in enumerate(sub_labels): - sub_dict[label] = i - label_dict.append(sub_dict) - else: - label_dict = {} - for i, label in enumerate(labels): - label_dict[label] = i - return label_dict - - def _convert_label_to_id(labels, label_dict): - if isinstance(labels, list) or isinstance(labels, tuple): - for label_idx in range(len(labels)): - labels[label_idx] = label_dict[labels[label_idx]] - else: - labels = label_dict[labels] - return labels - - if self.lazy: - - def generate_examples(): - generator = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) - for example in generator: - # We need to check if the example contains label column and confirm its name. - # For now we only allow `label` or `labels` to be the name of label column. - if "labels" in example.keys(): - label_col = "labels" - elif "label" in example.keys(): - label_col = "label" - else: - label_col = None - - # Convert class label to label ids. - if label_list is not None and example.get(label_col, None): - label_dict = _create_dict(label_list) - # For multiple labels in the form of list. - if isinstance(label_dict, list): - for idx, sub_dict in enumerate(label_dict): - example[label_col][idx] = _convert_label_to_id( - example[label_col][idx], sub_dict) - else: - example[label_col] = _convert_label_to_id( - example[label_col], label_dict) - - yield example - else: - yield example - - return IterDataset( - generate_examples(), - label_list=label_list, - vocab_info=vocab_info) - else: - examples = (self._read(filename, split) - if self._read.__code__.co_argcount > 2 else - self._read(filename)) - - # Then some validation. - if not isinstance(examples, list): - examples = list(examples) - - if not examples: - raise ValueError( - "No instances were read from the given filepath {}. " - "Is the path correct?".format(filename)) - - # We need to check if the example contains label column and confirm its name. - # For now we only allow `label` or `labels` to be the name of label column. - if "labels" in examples[0].keys(): - label_col = "labels" - elif "label" in examples[0].keys(): - label_col = "label" - else: - label_col = None - - # Convert class label to label ids. - if label_list is not None and examples[0].get(label_col, None): - label_dict = _create_dict(label_list) - for idx in range(len(examples)): - # For multiple labels in the form of list. - if isinstance(label_dict, list): - for i, sub_dict in enumerate(label_dict): - examples[idx][label_col][i] = _convert_label_to_id( - examples[idx][label_col][i], sub_dict) - else: - examples[idx][label_col] = _convert_label_to_id( - examples[idx][label_col], label_dict) - - return MapDataset( - examples, label_list=label_list, vocab_info=vocab_info) - - def _read(self, filename: str, *args): - """ - Reads examples from the given file_path and returns them as an - `Iterable` (which could be a list or a generator). - - This method must be implemented in self-defined `DatasetBuilder`. - """ - raise NotImplementedError - - def _get_data(self, mode: str): - """ - Downloads examples from the given URL and customized split - informations and returns a filepath. - - This method must be implemented in self-defined `DatasetBuilder`. - """ - raise NotImplementedError - - def get_labels(self): - """ - Returns list of class labels of the dataset if specified. 
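A hedged, standalone illustration of the label-to-id conversion performed in `read` above; the label names and example are made up.

label_list = ["entailment", "neutral", "contradiction"]
label_dict = {label: i for i, label in enumerate(label_list)}   # what _create_dict builds

example = {"text": "premise ||| hypothesis", "label": "neutral"}
example["label"] = label_dict[example["label"]]                  # what _convert_label_to_id does
print(example)   # {'text': 'premise ||| hypothesis', 'label': 1}

# When a dataset defines several label fields as a list of label lists, one
# dict is built per sub-list and each position of `labels` is mapped with its
# own dict, as in the list branch above.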
- """ - return None - - def get_vocab(self): - """ - Returns vocab file path of the dataset if specified. - """ - return None - - -class SimpleBuilder(DatasetBuilder): - def __init__(self, lazy, read_func): - self._read = read_func - self.lazy = lazy - - def read(self, **kwargs): - if self.lazy: - - def generate_examples(): - generator = self._read(**kwargs) - for example in generator: - yield example - - return IterDataset(generate_examples) - else: - examples = self._read(**kwargs) - if hasattr(examples, "__len__") and hasattr(examples, - "__getitem__"): - return MapDataset(examples) - else: - return MapDataset(list(examples)) - - -def has_file_allowed_extension(filename: str, - extensions: Union[str, Tuple[str, ...]]) -> bool: - """Checks if a file is an allowed extension. - - Args: - filename (string): path to a file - extensions (tuple of strings): extensions to consider (lowercase) - - Returns: - bool: True if the filename ends with one of given extensions - """ - return filename.lower().endswith( - extensions if isinstance(extensions, str) else tuple(extensions)) - - -def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]: - """Finds the class folders in a dataset. - - See :class:`DatasetFolder` for details. - """ - classes = sorted( - entry.name for entry in os.scandir(directory) if entry.is_dir()) - if not classes: - raise FileNotFoundError( - f"Couldn't find any class folder in {directory}.") - - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def make_dataset( - directory: str, - class_to_idx: Optional[Dict[str, int]]=None, - extensions: Optional[Union[str, Tuple[str, ...]]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[Tuple[ - str, int]]: - """Generates a list of samples of a form (path_to_sample, class). - - See :class:`DatasetFolder` for details. - - Note: The class_to_idx parameter is here optional and will use the logic of the ``find_classes`` function - by default. - """ - directory = os.path.expanduser(directory) - - if class_to_idx is None: - _, class_to_idx = find_classes(directory) - elif not class_to_idx: - raise ValueError( - "'class_to_index' must have at least one entry to collect any samples." - ) - - both_none = extensions is None and is_valid_file is None - both_something = extensions is not None and is_valid_file is not None - if both_none or both_something: - raise ValueError( - "Both extensions and is_valid_file cannot be None or not None at the same time" - ) - - if extensions is not None: - - def is_valid_file(x: str) -> bool: - return has_file_allowed_extension( - x, extensions) # type: ignore[arg-type] - - is_valid_file = cast(Callable[[str], bool], is_valid_file) - - instances = [] - available_classes = set() - for target_class in sorted(class_to_idx.keys()): - class_index = class_to_idx[target_class] - target_dir = os.path.join(directory, target_class) - if not os.path.isdir(target_dir): - continue - for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)): - for fname in sorted(fnames): - path = os.path.join(root, fname) - if is_valid_file(path): - item = path, class_index - instances.append(item) - - if target_class not in available_classes: - available_classes.add(target_class) - - empty_classes = set(class_to_idx.keys()) - available_classes - if empty_classes: - msg = ( - f"Found no valid file for the classes {', '.join(sorted(empty_classes))}. 
" - ) - if extensions is not None: - msg += f"Supported extensions are: {extensions if isinstance(extensions, str) else ', '.join(extensions)}" - raise FileNotFoundError(msg) - - return instances - - -class DatasetFolder(Dataset): - """A generic data loader. - - This default directory structure can be customized by overriding the - :meth:`find_classes` method. - - Args: - root (string): Root directory path. - loader (callable): A function to load a sample given its path. - extensions (tuple[string]): A list of allowed extensions. - both extensions and is_valid_file should not be passed. - transform (callable, optional): A function/transform that takes in - a sample and returns a transformed version. - E.g, ``transforms.RandomCrop`` for images. - target_transform (callable, optional): A function/transform that takes - in the target and transforms it. - is_valid_file (callable, optional): A function that takes path of a file - and check if the file is a valid file (used to check of corrupt files) - both extensions and is_valid_file should not be passed. - - Attributes: - classes (list): List of the class names sorted alphabetically. - class_to_idx (dict): Dict with items (class_name, class_index). - samples (list): List of (sample path, class_index) tuples - targets (list): The class_index value for each image in the dataset - """ - - def __init__( - self, - root: str, - loader: Callable[[str], Any], - extensions: Optional[Tuple[str, ...]]=None, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> None: - # super().__init__(root, transform=transform, target_transform=target_transform) - # super().__init__() - self.root = root - self.transform = transform - self.target_transform = target_transform - - classes, class_to_idx = self.find_classes(self.root) - samples = self.make_dataset(self.root, class_to_idx, extensions, - is_valid_file) - - self.loader = loader - self.extensions = extensions - - self.classes = classes - self.class_to_idx = class_to_idx - self.samples = samples - self.targets = [s[1] for s in samples] - - @staticmethod - def make_dataset( - directory: str, - class_to_idx: Dict[str, int], - extensions: Optional[Tuple[str, ...]]=None, - is_valid_file: Optional[Callable[[str], bool]]=None, ) -> List[ - Tuple[str, int]]: - """Generates a list of samples of a form (path_to_sample, class). - - This can be overridden to e.g. read files from a compressed zip file instead of from the disk. - - Args: - directory (str): root dataset directory, corresponding to ``self.root``. - class_to_idx (Dict[str, int]): Dictionary mapping class name to class index. - extensions (optional): A list of allowed extensions. - Either extensions or is_valid_file should be passed. Defaults to None. - is_valid_file (optional): A function that takes path of a file - and checks if the file is a valid file - (used to check of corrupt files) both extensions and - is_valid_file should not be passed. Defaults to None. - - Raises: - ValueError: In case ``class_to_idx`` is empty. - ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None. - FileNotFoundError: In case no valid file was found for any class. 
- - Returns: - List[Tuple[str, int]]: samples of a form (path_to_sample, class) - """ - if class_to_idx is None: - # prevent potential bug since make_dataset() would use the class_to_idx logic of the - # find_classes() function, instead of using that of the find_classes() method, which - # is potentially overridden and thus could have a different logic. - raise ValueError("The class_to_idx parameter cannot be None.") - return make_dataset( - directory, - class_to_idx, - extensions=extensions, - is_valid_file=is_valid_file) - - def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]: - """Find the class folders in a dataset structured as follows:: - - directory/ - ├── class_x - │ ├── xxx.ext - │ ├── xxy.ext - │ └── ... - │ └── xxz.ext - └── class_y - ├── 123.ext - ├── nsdf3.ext - └── ... - └── asd932_.ext - - This method can be overridden to only consider - a subset of classes, or to adapt to a different dataset directory structure. - - Args: - directory(str): Root directory path, corresponding to ``self.root`` - - Raises: - FileNotFoundError: If ``dir`` has no class folders. - - Returns: - (Tuple[List[str], Dict[str, int]]): List of all classes and dictionary mapping each class to an index. - """ - return find_classes(directory) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - """ - Args: - index (int): Index - - Returns: - tuple: (sample, target) where target is class_index of the target class. - """ - path, target = self.samples[index] - sample = self.loader(path) - if self.transform is not None: - sample = self.transform(sample) - if self.target_transform is not None: - target = self.target_transform(target) - - return sample, target - - def __len__(self) -> int: - return len(self.samples) - - -IMG_EXTENSIONS = ( - ".jpg", - ".jpeg", - ".png", - ".ppm", - ".bmp", - ".pgm", - ".tif", - ".tiff", - ".webp", ) - - -def pil_loader(path: str) -> Image.Image: - # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) - with open(path, "rb") as f: - img = Image.open(f) - return img.convert("RGB") - - -def default_loader(path: str) -> Any: - return pil_loader(path) - - -class ImageFolder(DatasetFolder): - """A generic data loader where the images are arranged in this way by default: :: - - root/dog/xxx.png - root/dog/xxy.png - root/dog/[...]/xxz.png - - root/cat/123.png - root/cat/nsdf3.png - root/cat/[...]/asd932_.png - - This class inherits from :class:`~torchvision.datasets.DatasetFolder` so - the same methods can be overridden to customize the dataset. - - Args: - root (string): Root directory path. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.RandomCrop`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - loader (callable, optional): A function to load an image given its path. - is_valid_file (callable, optional): A function that takes path of an Image file - and check if the file is a valid file (used to check of corrupt files) - - Attributes: - classes (list): List of the class names sorted alphabetically. - class_to_idx (dict): Dict with items (class_name, class_index). 
- imgs (list): List of (image path, class_index) tuples - """ - - def __init__( - self, - root: str, - transform: Optional[Callable]=None, - target_transform: Optional[Callable]=None, - loader: Callable[[str], Any]=default_loader, - is_valid_file: Optional[Callable[[str], bool]]=None, ): - super().__init__( - root, - loader, - IMG_EXTENSIONS if is_valid_file is None else None, - transform=transform, - target_transform=target_transform, - is_valid_file=is_valid_file, ) - self.imgs = self.samples diff --git a/paddlevlp/models/blip2/eva_vit.py b/paddlevlp/models/blip2/eva_vit.py deleted file mode 100644 index 5e6d2c50c950f..0000000000000 --- a/paddlevlp/models/blip2/eva_vit.py +++ /dev/null @@ -1,517 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Callable - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.nn.initializer import Constant, Normal, TruncatedNormal - -from paddlemix.models.blip2.configuration import Blip2VisionConfig -from paddlemix.models.blip2.modeling import Blip2PretrainedModel -# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# reference: https://arxiv.org/abs/2010.11929 -from paddlemix.utils.log import logger - -trunc_normal_ = TruncatedNormal(std=0.02) -normal_ = Normal -zeros_ = Constant(value=0.0) -ones_ = Constant(value=1.0) -from paddle.distributed.fleet.utils import recompute - - -def to_2tuple(x): - return tuple([x] * 2) - - -def drop_path(x, drop_prob=0.0, training=False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
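A hedged sketch of how the stochastic-depth `drop_path` function above behaves at training time; it assumes paddle is available and the function is in scope.

import paddle

x = paddle.ones([4, 3])                        # 4 samples from a residual branch
y = drop_path(x, drop_prob=0.5, training=True)
# Each sample is either zeroed out entirely or rescaled by 1 / (1 - drop_prob),
# so the expected value per sample still equals x.
print(y)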
- """ - if drop_prob == 0.0 or not training: - return x - keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Mlp(nn.Layer): - def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, - mp_degree=1, ): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - if mp_degree > 1: - self.fc1 = fleet.meta_parallel.ColumnParallelLinear( - in_features, - hidden_features, - weight_attr=None, - has_bias=True, - gather_output=True, ) - self.fc2 = fleet.meta_parallel.ColumnParallelLinear( - hidden_features, - out_features, - weight_attr=None, - has_bias=True, - gather_output=True, ) - else: - self.fc1 = nn.Linear(in_features, hidden_features) - self.fc2 = nn.Linear(hidden_features, out_features) - self.mp_degree = mp_degree - self.act = act_layer() - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.fc2(x) - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - x = self.drop(x) - else: - x = self.drop(x) - return x - - -class Attention(nn.Layer): - def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, - window_size=None, - mp_degree=1, ): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - if mp_degree > 1: - self.qkv = fleet.meta_parallel.ColumnParallelLinear( - dim, - dim * 3, - weight_attr=None, - has_bias=True, - gather_output=True) - else: - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - if mp_degree > 1: - self.proj = fleet.meta_parallel.ColumnParallelLinear( - dim, dim, weight_attr=None, has_bias=True, gather_output=True) - else: - self.proj = nn.Linear(dim, dim) - self.mp_degree = mp_degree - self.proj_drop = nn.Dropout(proj_drop) - - def _register_relative_position_index( - self, - window_size, - num_heads, ): - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = paddle.zeros( - (window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype) - 
relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - def forward(self, x, rel_pos_bias=None): - # B= paddle.shape(x)[0] - N, C = x.shape[1:] - # if self.q_bias is not None: - # qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias)) - qkv = (self.qkv(x).reshape( - (-1, N, 3, self.num_heads, C // self.num_heads)).transpose( - (2, 0, 3, 1, 4))) - # print(self.qkv.bias[2100]) - q, k, v = qkv[0], qkv[1], qkv[2] - - attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - if hasattr(self, "relative_position_bias_table"): - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, - -1, - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - attn = nn.functional.softmax(attn, axis=-1) - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - attn = self.attn_drop(attn) - else: - attn = self.attn_drop(attn) - - x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) - x = self.proj(x) - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - x = self.proj_drop(x) - else: - x = self.proj_drop(x) - return x - - -class Block(nn.Layer): - def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - init_values=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer="nn.LayerNorm", - epsilon=1e-5, - window_size=None, - mp_degree=1, ): - super().__init__() - if isinstance(norm_layer, str): - self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm1 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - window_size=window_size, - mp_degree=mp_degree, ) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) - self.gamma_1 = None - self.gamma_2 = None - if isinstance(norm_layer, str): - self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm2 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp( - in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop, - mp_degree=mp_degree, ) - - def forward(self, x, rel_pos_bias=None): - if self.gamma_1 is not None: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - else: - x = x + self.drop_path( - self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - return x - - -class RelativePositionBias(nn.Layer): - def __init__(self, window_size, num_heads): - super().__init__() - 
self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - [self.num_relative_distance, num_heads], - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - relative_coords = ( - coords_flatten[:, :, None] - coords_flatten[:, None, :] - ) # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = paddle.zeros( - (window_size[0] * window_size[1] + 1, ) * 2, - dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", relative_position_index) - - # trunc_normal_(self.relative_position_bias_table, std=.02) - - def forward(self): - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, - -1, - ]) # Wh*Ww,Wh*Ww,nH - return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww - - -class PatchEmbed(nn.Layer): - """Image to Patch Embedding""" - - def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // - patch_size[0]) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - def forward(self, x): - B, C, H, W = x.shape - assert ( - H == self.img_size[0] and W == self.img_size[1] - ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
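A hedged sketch of the patch/token arithmetic for the `PatchEmbed` layer above, assuming a 224x224 input and 16x16 patches as in the defaults.

import paddle

pe = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
print(pe.num_patches)           # (224 // 16) * (224 // 16) == 196

x = paddle.randn([2, 3, 224, 224])
print(pe.proj(x).shape)         # [2, 768, 14, 14] before flattening
print(pe(x).shape)              # [2, 196, 768] after the flatten/transpose below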
- - x = self.proj(x).flatten(2).transpose((0, 2, 1)) - return x - - -class VisionTransformer(Blip2PretrainedModel): - """Vision Transformer with support for patch input""" - - main_input_name = "pixel_values" - config_class = Blip2VisionConfig - - def __init__(self, config: Blip2VisionConfig, **kwargs): - super().__init__(config) - from paddle.distributed import fleet - - mp_degree = fleet.DistributedStrategy().hybrid_configs["mp_degree"] - self.class_num = config.class_num - self.num_features = self.embed_dim = config.embed_dim - _img_size = to_2tuple(config.img_size) - _patch_size = to_2tuple(config.patch_size) - self.window_size = ( - _img_size[0] // _patch_size[0], - _img_size[1] // _patch_size[1], ) - self.patch_embed = PatchEmbed( - img_size=config.img_size, - patch_size=config.patch_size, - in_chans=config.in_chans, - embed_dim=config.embed_dim, ) - num_patches = self.patch_embed.num_patches - self.cls_token = self.create_parameter( - shape=(1, 1, config.embed_dim), default_initializer=zeros_) - - self.pos_embed = self.create_parameter( - shape=(1, num_patches + 1, config.embed_dim), - default_initializer=zeros_) - - self.add_parameter("pos_embed", self.pos_embed) - - self.add_parameter("cls_token", self.cls_token) - self.pos_drop = nn.Dropout(p=config.drop_rate) - self.gradient_checkpointing = config.gradient_checkpointing - logger.info("self.gradient_checkpointing:{}".format( - self.gradient_checkpointing)) - dpr = np.linspace(0, config.drop_path_rate, config.depth) - - self.blocks = nn.LayerList([ - Block( - dim=config.embed_dim, - num_heads=config.num_heads, - mlp_ratio=config.mlp_ratio, - qkv_bias=config.qkv_bias, - qk_scale=config.qk_scale, - drop=config.drop_rate, - attn_drop=config.attn_drop_rate, - drop_path=dpr[i], - norm_layer=config.norm_layer, - epsilon=config.epsilon, - window_size=self.window_size, - mp_degree=mp_degree, ) for i in range(config.depth) - ]) - - self.mp_degree = mp_degree - if self.pos_embed is not None: - trunc_normal_(self.pos_embed) - trunc_normal_(self.cls_token) - self.apply(self._init_weights) - - def _init_weights(self, m): - if isinstance(m, (nn.Linear, fleet.meta_parallel.ColumnParallelLinear)): - trunc_normal_(m.weight) - if (isinstance(m, (nn.Linear, - fleet.meta_parallel.ColumnParallelLinear)) and - m.bias is not None): - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - ones_(m.weight) - - def forward_features(self, x): - # B = x.shape[0] - B = paddle.shape(x)[0] - x = self.patch_embed(x) - cls_tokens = self.cls_token.expand((B, -1, -1)) - x = paddle.concat((cls_tokens, x), axis=1) - - if self.pos_embed is not None: - x = x + self.pos_embed - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - x = self.pos_drop(x) - else: - x = self.pos_drop(x) - rel_pos_bias = self.rel_pos_bias() if hasattr(self, - "rel_pos_bias") else None - for blk in self.blocks: - if self.gradient_checkpointing and self.training: - - x = recompute(blk, x, rel_pos_bias=rel_pos_bias) - else: - x = blk(x, rel_pos_bias=rel_pos_bias) - # x = self.norm(x) - return x - - def forward(self, x): - x = self.forward_features(x) - return x - - -def interpolate_pos_embed(model, checkpoint_model): - if "visual_encoder.pos_embed" in checkpoint_model: - pos_embed_checkpoint = checkpoint_model["visual_encoder.pos_embed"] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.visual_encoder.patch_embed.num_patches - num_extra_tokens = model.visual_encoder.pos_embed.shape[ - -2] - num_patches - # height (== width) for the 
checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) - pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode="bicubic", - align_corners=False, ) - pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) - new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model["visual_encoder.pos_embed"] = new_pos_embed - elif "pos_embed" in checkpoint_model: - pos_embed_checkpoint = checkpoint_model["pos_embed"] - embedding_size = pos_embed_checkpoint.shape[-1] - num_patches = model.patch_embed.num_patches - num_extra_tokens = model.pos_embed.shape[-2] - num_patches - # height (== width) for the checkpoint position embedding - orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** - 0.5) - # height (== width) for the new position embedding - new_size = int(num_patches**0.5) - # class_token and dist_token are kept unchanged - if orig_size != new_size: - print("Position interpolate from %dx%d to %dx%d" % - (orig_size, orig_size, new_size, new_size)) - extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] - # only the position tokens are interpolated - pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] - pos_tokens = pos_tokens.reshape( - (-1, orig_size, orig_size, embedding_size)).transpose( - (0, 3, 1, 2)) - pos_tokens = paddle.nn.functional.interpolate( - pos_tokens, - size=(new_size, new_size), - mode="bicubic", - align_corners=False, ) - pos_tokens = pos_tokens.transpose((0, 2, 3, 1)).flatten(1, 2) - new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) - checkpoint_model["pos_embed"] = new_pos_embed diff --git a/paddlevlp/trainer/trainer.py b/paddlevlp/trainer/trainer.py deleted file mode 100644 index 8b2e44bd86de8..0000000000000 --- a/paddlevlp/trainer/trainer.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
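A hedged sketch of the reshaping performed by the `interpolate_pos_embed` helper above when a checkpoint was trained at a different resolution; the grid sizes and tensors are illustrative, and only paddle is assumed.

import paddle

embed_dim, num_extra_tokens = 768, 1
old_grid, new_grid = 14, 16                      # e.g. 224/16 patches -> 256/16 patches
pos_embed = paddle.randn([1, num_extra_tokens + old_grid**2, embed_dim])

extra = pos_embed[:, :num_extra_tokens]          # class-token embedding is kept as-is
patch = pos_embed[:, num_extra_tokens:]
patch = patch.reshape((-1, old_grid, old_grid, embed_dim)).transpose((0, 3, 1, 2))
patch = paddle.nn.functional.interpolate(
    patch, size=(new_grid, new_grid), mode="bicubic", align_corners=False)
patch = patch.transpose((0, 2, 3, 1)).flatten(1, 2)
new_pos_embed = paddle.concat((extra, patch), axis=1)
print(new_pos_embed.shape)                       # [1, 1 + 256, 768]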
-import numpy as np -import paddle -from paddle.io import DataLoader -from paddlenlp.trainer.trainer import Trainer -from tensorboardX import SummaryWriter - -from paddlemix.models.evaclip.utils import clip_grad_norm - - -class CLIPTrainer(Trainer): - def __init__(self, **kwargs): - """ - Implementation of an `Trainer` suitable for EVA-CLIP - 1、selfdefine optimizer for sharding which can't create by passing by args - - Args: - kwargs (dict): any arugments to pass to `Trainer` - - Returns: - None - """ - super().__init__(**kwargs) - self.rank = paddle.distributed.get_rank() - if self.rank == 0 and self.args.tensorboard: - self.writer = SummaryWriter("output/tensorboard") - self.logstep = 0 - - def training_step(self, model, inputs) -> paddle.Tensor: - """ - Perform a training step on a batch of inputs. - - Subclass and override to inject custom behavior. - - Args: - model (`nn.Layer`): - The model to train. - inputs (`Dict[str, Union[paddle.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - - Return: - `paddle.Tensor`: The tensor with training loss on this batch. - """ - - if self.args.pipeline_parallel_degree > 1: - return self.training_pipeline_step(model, inputs) - - model.train() - inputs = self._prepare_inputs(inputs) - - with self.autocast_smart_context_manager(): - loss, outputs = self.compute_loss(model, inputs, return_outputs=1) - loss_itc, image_features, text_features, logit_scale = outputs - - if self.args.gradient_accumulation_steps > 1: - loss = loss / self.args.gradient_accumulation_steps - - if self.do_grad_scaling: - self.scaler.scale(loss).backward() - else: - loss.backward() - - if self.args.max_grad_norm > 0.0: - grad_norms = clip_grad_norm(model, self.args.max_grad_norm) - if self.rank == 0 and self.args.tensorboard: - self.logstep += 1 - self.writer.add_scalar("train/loss", loss.item(), self.logstep) - self.writer.add_scalar("train/grad_norm", - grad_norms.item(), self.logstep) - self.writer.add_scalar("train/logit_scale", - logit_scale.item(), self.logstep) - - return loss.detach() - - def get_train_dataloader(self): - """ - Returns the training [`~paddle.io.DataLoader`]. - - Will use no sampler if `self.train_dataset` does not implement `__len__`, a random sampler (adapted to - distributed training if necessary) otherwise. - - Subclass and override this method if you want to inject some custom behavior. 
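A hedged, standalone sketch of the gradient-accumulation scaling used in `training_step` above, written against plain paddle rather than the Trainer machinery; the tiny model and data are placeholders.

import paddle

model = paddle.nn.Linear(4, 1)
opt = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=1e-3)
accum_steps = 4

for step in range(8):
    x = paddle.randn([2, 4])
    loss = model(x).mean()
    # Dividing keeps the accumulated gradient equal to the average over the
    # micro-batches, matching the loss scaling in training_step.
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        opt.step()
        opt.clear_grad()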
- """ - - return DataLoader( - self.train_dataset, - batch_size=self.args.per_device_train_batch_size, - collate_fn=self.data_collator, - num_workers=self.args.dataloader_num_workers, - prefetch_factor=1, - shuffle=False, ) diff --git a/ppdiffusers/deploy/controlnet/export_model.py b/ppdiffusers/deploy/controlnet/export_model.py index b6b3b146277a8..c2a406db5acc9 100644 --- a/ppdiffusers/deploy/controlnet/export_model.py +++ b/ppdiffusers/deploy/controlnet/export_model.py @@ -18,36 +18,42 @@ import paddle -from ppdiffusers import (ControlNetModel, FastDeployRuntimeModel, - FastDeployStableDiffusionControlNetPipeline, - StableDiffusionControlNetPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + ControlNetModel, + FastDeployRuntimeModel, + FastDeployStableDiffusionControlNetPipeline, + StableDiffusionControlNetPipeline, + UNet2DConditionModel, +) class ControlNetWithUnetModel(paddle.nn.Layer): def __init__( - self, - unet, - controlnet, ): + self, + unet, + controlnet, + ): super().__init__() self.unet = unet self.controlnet = controlnet def forward( - self, - sample, - timestep, - encoder_hidden_states, - controlnet_cond, - controlnet_conditioning_scale, - return_dict=True, ): + self, + sample, + timestep, + encoder_hidden_states, + controlnet_cond, + controlnet_conditioning_scale, + return_dict=True, + ): down_block_res_samples, mid_block_res_sample = self.controlnet( sample, timestep, encoder_hidden_states=encoder_hidden_states, controlnet_cond=controlnet_cond, conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) + return_dict=False, + ) noise_pred = self.unet( sample, @@ -55,21 +61,21 @@ def forward( encoder_hidden_states=encoder_hidden_states, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, - return_dict=return_dict, ) + return_dict=return_dict, + ) return noise_pred def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - controlnet_model_path: str, - output_path: str, - sample: bool=False, - height: int=None, - width: int=None, ): - unet_tmp = UNet2DConditionModel.from_pretrained( - model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") - controlnet_tmp = ControlNetModel.from_pretrained( - controlnet_model_path, resnet_pre_temb_non_linearity=True) + model_path: str, + controlnet_model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): + unet_tmp = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") + controlnet_tmp = ControlNetModel.from_pretrained(controlnet_model_path, resnet_pre_temb_non_linearity=True) pipeline = StableDiffusionControlNetPipeline.from_pretrained( model_path, @@ -77,7 +83,8 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( controlnet=controlnet_tmp, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) # make sure we disable xformers pipeline.disable_xformers_memory_efficient_attention() output_path = Path(output_path) @@ -85,8 +92,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( latent_height = height // 8 if height is not None else None latent_width = width // 8 if width is not None else None # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 vae_in_channels = 
pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -94,14 +100,12 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert text_encoder text_encoder = paddle.jit.to_static( pipeline.text_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None], dtype="int64", name="input_ids") - ], # input_ids + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids ) save_path = os.path.join(args.output_path, "text_encoder", "inference") paddle.jit.save(text_encoder, save_path) @@ -109,8 +113,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( del pipeline.text_encoder # wrap unet + controlnet - new_unet = ControlNetWithUnetModel( - unet=pipeline.unet, controlnet=pipeline.controlnet) + new_unet = ControlNetWithUnetModel(unet=pipeline.unet, controlnet=pipeline.controlnet) # 2. Convert unet unet = paddle.jit.to_static( @@ -119,23 +122,26 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( paddle.static.InputSpec( shape=[None, unet_channels, latent_height, latent_width], dtype="float32", - name="sample", ), # sample - paddle.static.InputSpec( - shape=[1], dtype="float32", name="timestep"), # timestep + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states + name="encoder_hidden_states", + ), # encoder_hidden_states paddle.static.InputSpec( shape=[None, vae_in_channels, height, width], dtype="float32", - name="controlnet_cond", ), # controlnet_cond + name="controlnet_cond", + ), # controlnet_cond paddle.static.InputSpec( shape=[len(pipeline.unet.config.block_out_channels) * 3 + 1], dtype="float32", name="controlnet_conditioning_scale", ), # controlnet_conditioning_scale - ], ) + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) @@ -152,8 +158,7 @@ def forward_vae_encoder_sample(self, z): # 3. Convert vae encoder vae_encoder = pipeline.vae if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) else: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) @@ -165,7 +170,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -184,8 +190,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. 
save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -193,18 +201,16 @@ def forward_vae_decoder(self, z): del pipeline.vae fastdeploy_pipeline = FastDeployStableDiffusionControlNetPipeline( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "text_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), tokenizer=pipeline.tokenizer, scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -224,26 +230,25 @@ def forward_vae_decoder(self, z): default="lllyasviel/sd-controlnet-canny", help="Path to the `ppdiffusers` controlnet_pretrained_model_name_or_path checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--sample", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) parser.add_argument( "--height", type=int, default=None, - help="The height of output images. Default: None", ) + help="The height of output images. Default: None", + ) parser.add_argument( "--width", type=int, default=None, - help="The width of output images. Default: None", ) + help="The width of output images. 
Default: None", + ) args = parser.parse_args() convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( @@ -252,4 +257,5 @@ def forward_vae_decoder(self, z): args.output_path, args.sample, args.height, - args.width, ) + args.width, + ) diff --git a/ppdiffusers/deploy/controlnet/infer.py b/ppdiffusers/deploy/controlnet/infer.py index 3e516abb02cc0..10350965eb703 100644 --- a/ppdiffusers/deploy/controlnet/infer.py +++ b/ppdiffusers/deploy/controlnet/infer.py @@ -27,8 +27,7 @@ from PIL import Image from tqdm.auto import trange -from ppdiffusers import (DiffusionPipeline, - FastDeployStableDiffusionMegaPipeline) +from ppdiffusers import DiffusionPipeline, FastDeployStableDiffusionMegaPipeline from ppdiffusers.utils import load_image @@ -48,17 +47,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5@fastdeploy", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--backend", type=str, @@ -78,7 +80,8 @@ def parse_arguments(): "huawei_ascend_npu", "kunlunxin_xpu", ], - help="The inference runtime device of models.", ) + help="The inference runtime device of models.", + ) parser.add_argument( "--task_name", type=str, @@ -100,17 +103,10 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -131,7 +127,8 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) + help="The scheduler type of stable diffusion.", + ) parser.add_argument( "--infer_op", type=str, @@ -141,33 +138,25 @@ def parse_arguments(): "raw", "all", ], - help="The type of infer op.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") - parser.add_argument( - "--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") + help="The type of infer op.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") + parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") parser.add_argument( "--low_threshold", type=int, default=100, - help="The value of Canny low threshold.", ) + help="The value of Canny low threshold.", + ) parser.add_argument( "--high_threshold", type=int, default=200, - help="The value of Canny high threshold.", ) + help="The value of Canny high threshold.", + ) return parser.parse_args() @@ -182,14 +171,15 @@ def create_ort_runtime(device_id=0): def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, ): + use_trt=False, + dynamic_shape=None, + use_fp16=False, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + paddle_stream=None, + workspace=None, +): option = fd.RuntimeOption() option.use_paddle_backend() if device_id == -1: @@ -227,7 +217,8 @@ def create_paddle_inference_runtime( key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), - shape_dict.get("max_shape", None), ) + shape_dict.get("max_shape", None), + ) return option @@ -238,8 +229,10 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): option.use_ascend() option.set_lite_device_names(["huawei_ascend_npu"]) option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision". 
- format(device_id)) + "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( + device_id + ) + ) elif device == "kunlunxin_xpu": # TODO(shentanyue): Add kunlunxin_xpu code # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 @@ -251,7 +244,8 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): autotune_file="", precision="int16", adaptive_seqlen=True, - enable_multi_stream=True, ) + enable_multi_stream=True, + ) if use_fp16: option.enable_lite_fp16() else: @@ -259,10 +253,7 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): return option -def create_trt_runtime(workspace=(1 << 31), - dynamic_shape=None, - use_fp16=False, - device_id=0): +def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): option = fd.RuntimeOption() option.use_trt_backend() option.use_gpu(device_id) @@ -276,7 +267,8 @@ def create_trt_runtime(workspace=(1 << 31), key, min_shape=shape_dict["min_shape"], opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), ) + max_shape=shape_dict.get("max_shape", None), + ) # cache_file = os.path.join(model_dir, model_prefix, "inference.trt") # option.set_trt_cache_file(cache_file) return option @@ -288,8 +280,7 @@ def main(args): paddle_stream = None else: paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream( - args.device_id).cuda_stream + paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream infer_op_dict = { "vae_encoder": args.infer_op, @@ -323,12 +314,9 @@ def main(args): } vae_decoder_dynamic_shape = { "latent_sample": { - "min_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": - [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], + "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], } } unet_dynamic_shape = { @@ -379,37 +367,38 @@ def main(args): text_encoder=create_ort_runtime(device_id=args.device_id), vae_encoder=create_ort_runtime(device_id=args.device_id), vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), ) + unet=create_ort_runtime(device_id=args.device_id), + ) elif args.backend == "paddlelite": runtime_options = dict( - text_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime( - device=args.device, - device_id=args.device_id, - use_fp16=args.use_fp16), ) + text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), + ) elif 
args.backend == "tensorrt": runtime_options = dict( text_encoder=create_trt_runtime( dynamic_shape=text_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_encoder=create_trt_runtime( dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_decoder=create_trt_runtime( dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), unet=create_trt_runtime( dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), ) + device_id=args.device_id, + ), + ) elif args.backend == "paddle" or args.backend == "paddle_tensorrt": args.use_trt = args.backend == "paddle_tensorrt" runtime_options = dict( @@ -419,28 +408,34 @@ def main(args): use_fp16=args.use_fp16, device_id=args.device_id, disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_encoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_decoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), unet=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, device_id=args.device_id, - paddle_stream=paddle_stream, ), ) + paddle_stream=paddle_stream, + ), + ) pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained( args.model_dir, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -454,9 +449,7 @@ def main(args): else: infer_op_list = [args.infer_op] if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print( - "When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op." 
- ) + print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") infer_op_list = ["raw"] for infer_op in infer_op_list: @@ -466,8 +459,7 @@ def main(args): "text_encoder": infer_op, "unet": infer_op, } - folder = (f"infer_op_{infer_op}_fp16" - if args.use_fp16 else f"infer_op_{infer_op}_fp32") + folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img_control", "all"]: @@ -487,7 +479,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test text2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -500,7 +493,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -526,7 +520,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test img2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -540,7 +535,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -551,7 +547,9 @@ def main(args): images[0].save(f"{folder}/img2img_control.png") if args.task_name in ["inpaint_legacy_control", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -569,7 +567,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test inpaint_legacy_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -584,7 +583,8 @@ def main(args): parse_prompt_type=parse_prompt_type, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -636,7 +636,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test hiresfix_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -653,7 +654,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images 
latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/controlnet/infer_dygraph.py b/ppdiffusers/deploy/controlnet/infer_dygraph.py index 94204a1a5bc77..89bd4d1e51aa9 100644 --- a/ppdiffusers/deploy/controlnet/infer_dygraph.py +++ b/ppdiffusers/deploy/controlnet/infer_dygraph.py @@ -60,12 +60,14 @@ def parse_arguments(): "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -87,12 +89,9 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode") parser.add_argument( "--guess_mode", type=strtobool, @@ -104,12 +103,9 @@ def parse_arguments(): type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -129,31 +125,24 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") parser.add_argument( "--low_threshold", type=int, default=100, - help="The value of Canny low threshold.", ) + help="The value of Canny low threshold.", + ) parser.add_argument( "--high_threshold", type=int, default=200, - help="The value of Canny high threshold.", ) + help="The value of Canny high threshold.", + ) return parser.parse_args() @@ -165,8 +154,8 @@ def main(args): seed = 1024 paddle_dtype = paddle.float16 if args.use_fp16 else paddle.float32 controlnet = ControlNetModel.from_pretrained( - args.controlnet_pretrained_model_name_or_path, - paddle_dtype=paddle_dtype) + args.controlnet_pretrained_model_name_or_path, paddle_dtype=paddle_dtype + ) pipe = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, controlnet=controlnet, @@ -174,7 +163,8 @@ def main(args): feature_extractor=None, requires_safety_checker=False, paddle_dtype=paddle_dtype, - custom_pipeline="stable_diffusion_mega", ) + 
custom_pipeline="stable_diffusion_mega", + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -200,17 +190,14 @@ def main(args): raise ValueError(e) if not args.use_fp16 and attention_type == "flash": - print( - "Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!" - ) + print("Flash attention is not supported dtype=float32! Please use float16 or bfloat16. We will skip this!") continue guess_mode = args.guess_mode width = args.width height = args.height hr_resize_width = args.hr_resize_width hr_resize_height = args.hr_resize_height - folder = (f"attn_{attention_type}_fp16" - if args.use_fp16 else f"attn_{attention_type}_fp32") + folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img_control", "all"]: init_image = load_image( @@ -229,7 +216,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test text2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -242,7 +230,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -268,7 +257,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test img2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -282,7 +272,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -293,7 +284,9 @@ def main(args): images[0].save(f"{folder}/img2img_control.png") if args.task_name in ["inpaint_legacy_control", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -311,7 +304,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -326,7 +320,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -358,7 +353,8 @@ def main(args): controlnet_cond=controlnet_cond, 
controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test hiresfix_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -375,7 +371,8 @@ def main(args): controlnet_cond=controlnet_cond, controlnet_conditioning_scale=1.0, guess_mode=guess_mode, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py b/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py index 673834d7dbd52..64e67ac852da1 100644 --- a/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py +++ b/ppdiffusers/deploy/controlnet/infer_dygraph_torch.py @@ -18,23 +18,31 @@ import torch -torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) +torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention delattr(torch.nn.functional, "scaled_dot_product_attention") import cv2 import numpy as np from diffusers import ( - ControlNetModel, DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, + ControlNetModel, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, StableDiffusionControlNetImg2ImgPipeline, - StableDiffusionControlNetInpaintPipeline, StableDiffusionControlNetPipeline, - UniPCMultistepScheduler) -from diffusers.models.attention_processor import (AttnProcessor, - AttnProcessor2_0) + StableDiffusionControlNetInpaintPipeline, + StableDiffusionControlNetPipeline, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 from diffusers.utils import load_image from PIL import Image from tqdm.auto import trange @@ -67,46 +75,40 @@ def change_scheduler(self, scheduler_type="ddim"): self.orginal_scheduler_config = self.scheduler.config scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = 
EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") return scheduler @@ -131,12 +133,14 @@ def parse_arguments(): "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=10, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -157,20 +161,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) parser.add_argument( "--channels_last", type=strtobool, default=False, - help="Wheter to use channels_last", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="Whether to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode") parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument( - "--compile", type=strtobool, default=False, help="compile") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") parser.add_argument( "--attention_type", type=str, default="raw", choices=[ "raw", "sdp", ], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -204,21 +202,22 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") parser.add_argument( "--low_threshold", type=int, default=100, - help="The value of Canny low threshold.", ) + help="The value of Canny low threshold.", + ) parser.add_argument( "--high_threshold", type=int, default=200, - help="The value of Canny high threshold.", ) + help="The value of Canny high threshold.", + ) return parser.parse_args() @@ -272,14 +271,16 @@ def main(args): seed = 1024 torch_dtype = torch.float16 if args.use_fp16 else torch.float32 controlnet = ControlNetModel.from_pretrained( - args.controlnet_pretrained_model_name_or_path, torch_dtype=torch_dtype) + args.controlnet_pretrained_model_name_or_path, torch_dtype=torch_dtype + ) pipe = StableDiffusionControlNetPipeline.from_pretrained( args.pretrained_model_name_or_path, controlnet=controlnet, safety_checker=None, feature_extractor=None, requires_safety_checker=False, - torch_dtype=torch_dtype, ) + torch_dtype=torch_dtype, + ) scheduler = change_scheduler(pipe, args.scheduler) pipe.scheduler = scheduler if args.device_id >= 0: @@ -291,11 +292,9 @@ def main(args): args.attention_type = [args.attention_type] for attention_type in args.attention_type: - attn_prrocessor_cls = (AttnProcessor - if attention_type == "raw" else AttnProcessor2_0) + attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = ( - torch.nn.functional.scaled_dot_product_attention_) + torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ set_attn_processor(pipe.unet, attn_prrocessor_cls()) set_attn_processor(pipe.vae, attn_prrocessor_cls()) set_attn_processor(pipe.controlnet, attn_prrocessor_cls()) @@ -306,24 +305,20 @@ def main(args): if args.compile: print("Run torch compile") - pipe.unet = torch.compile( - pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.controlnet = torch.compile( - pipe.controlnet, mode="reduce-overhead", fullgraph=True) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) width = args.width height = args.height pipe.set_progress_bar_config(disable=True) - folder = (f"torch_attn_{attention_type}_fp16" - if args.use_fp16 else f"torch_attn_{attention_type}_fp32") + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img_control", "all"]: init_image = load_image( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" ) - controlnet_cond = get_canny_image(init_image, args).resize( - (width, height)) + controlnet_cond = get_canny_image(init_image, args).resize((width, height)) # text2img prompt = "bird" time_costs = [] @@ -334,7 +329,8 @@ def main(args): height=height, width=width, image=controlnet_cond, - 
controlnet_conditioning_scale=1.0, ) + controlnet_conditioning_scale=1.0, + ) print("==> Test text2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -345,7 +341,8 @@ def main(args): height=height, width=width, image=controlnet_cond, - controlnet_conditioning_scale=1.0, ).images + controlnet_conditioning_scale=1.0, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -356,13 +353,11 @@ def main(args): images[0].save(f"{folder}/text2img_control.png") if args.task_name in ["img2img_control", "all"]: - pipe_img2img = StableDiffusionControlNetImg2ImgPipeline( - **pipe.components) + pipe_img2img = StableDiffusionControlNetImg2ImgPipeline(**pipe.components) pipe_img2img.set_progress_bar_config(disable=True) img_url = "sketch-mountains-input.png" init_image = load_image(img_url).resize((width, height)) - controlnet_cond = get_canny_image(init_image, args).resize( - (width, height)) + controlnet_cond = get_canny_image(init_image, args).resize((width, height)) prompt = "A fantasy landscape, trending on artstation" time_costs = [] # warmup @@ -373,7 +368,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ) + controlnet_conditioning_scale=1.0, + ) print("==> Test img2img_control performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -385,7 +381,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ).images + controlnet_conditioning_scale=1.0, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -396,15 +393,15 @@ def main(args): images[0].save(f"{folder}/img2img_control.png") if args.task_name in ["inpaint_legacy_control", "all"]: - pipe_inpaint = StableDiffusionControlNetInpaintPipeline( - **pipe.components) + pipe_inpaint = StableDiffusionControlNetInpaintPipeline(**pipe.components) pipe_inpaint.set_progress_bar_config(disable=True) - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url).resize((width, height)) mask_image = load_image(mask_url).resize((width, height)) - controlnet_cond = get_canny_image(init_image, args).resize( - (width, height)) + controlnet_cond = get_canny_image(init_image, args).resize((width, height)) prompt = "Face of a yellow cat, high resolution, sitting on a park bench" time_costs = [] task_name = "inpaint_legacy_control" @@ -416,7 +413,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ) + controlnet_conditioning_scale=1.0, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -429,7 +427,8 @@ def main(args): height=height, width=width, control_image=controlnet_cond, - controlnet_conditioning_scale=1.0, ).images + controlnet_conditioning_scale=1.0, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/export_model.py b/ppdiffusers/deploy/export_model.py index b7defe65362ce..00b9b4fd03f4a 100644 --- 
a/ppdiffusers/deploy/export_model.py +++ b/ppdiffusers/deploy/export_model.py @@ -19,26 +19,27 @@ import paddle -from ppdiffusers import (FastDeployRuntimeModel, - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionMegaPipeline, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + FastDeployRuntimeModel, + FastDeployStableDiffusionInpaintPipeline, + FastDeployStableDiffusionMegaPipeline, + StableDiffusionPipeline, + UNet2DConditionModel, +) def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - output_path: str, - sample: bool=False, - height: int=None, - width: int=None, ): + model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): # specify unet model with unet pre_temb_act opt enabled. - unet_model = UNet2DConditionModel.from_pretrained( - model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") + unet_model = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") pipeline = StableDiffusionPipeline.from_pretrained( - model_path, - unet=unet_model, - safety_checker=None, - feature_extractor=None) + model_path, unet=unet_model, safety_checker=None, feature_extractor=None + ) # make sure we disable xformers pipeline.disable_xformers_memory_efficient_attention() output_path = Path(output_path) @@ -46,8 +47,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( latent_height = height // 8 if height is not None else None latent_width = width // 8 if width is not None else None # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 or 9 vae_in_channels = pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -55,14 +55,12 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert text_encoder text_encoder = paddle.jit.to_static( pipeline.text_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None], dtype="int64", name="input_ids") - ], # input_ids + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids ) save_path = os.path.join(args.output_path, "text_encoder", "inference") paddle.jit.save(text_encoder, save_path) @@ -76,14 +74,16 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( paddle.static.InputSpec( shape=[None, unet_channels, latent_height, latent_width], dtype="float32", - name="sample", ), # sample - paddle.static.InputSpec( - shape=[1], dtype="float32", name="timestep"), # timestep + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states - ], ) + name="encoder_hidden_states", + ), # encoder_hidden_states + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) print(f"Save unet model in {save_path} successfully.") @@ -98,8 +98,7 @@ def forward_vae_encoder_sample(self, z): # 3. 
Convert vae encoder vae_encoder = pipeline.vae if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) else: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) @@ -111,7 +110,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -130,8 +130,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -144,18 +146,16 @@ def forward_vae_decoder(self, z): fd_pipe_cls = FastDeployStableDiffusionMegaPipeline fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "text_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), tokenizer=pipeline.tokenizer, scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -169,26 +169,25 @@ def forward_vae_decoder(self, z): required=True, help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--sample", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) parser.add_argument( "--height", type=int, default=None, - help="The height of output images. Default: None", ) + help="The height of output images. Default: None", + ) parser.add_argument( "--width", type=int, default=None, - help="The width of output images. Default: None", ) + help="The width of output images. 
Default: None", + ) args = parser.parse_args() convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( @@ -196,4 +195,5 @@ def forward_vae_decoder(self, z): args.output_path, args.sample, args.height, - args.width, ) + args.width, + ) diff --git a/ppdiffusers/deploy/infer.py b/ppdiffusers/deploy/infer.py index 8445343f255da..60152a7db32f4 100644 --- a/ppdiffusers/deploy/infer.py +++ b/ppdiffusers/deploy/infer.py @@ -25,8 +25,7 @@ from paddlenlp.trainer.argparser import strtobool from tqdm.auto import trange -from ppdiffusers import (DiffusionPipeline, - FastDeployStableDiffusionMegaPipeline) +from ppdiffusers import DiffusionPipeline, FastDeployStableDiffusionMegaPipeline from ppdiffusers.utils import load_image @@ -36,17 +35,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5@fastdeploy", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--backend", type=str, @@ -66,7 +68,8 @@ def parse_arguments(): "huawei_ascend_npu", "kunlunxin_xpu", ], - help="The inference runtime device of models.", ) + help="The inference runtime device of models.", + ) parser.add_argument( "--task_name", type=str, @@ -91,22 +94,11 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") - parser.add_argument( - "--use_bf16", - type=strtobool, - default=False, - help="Wheter to use BF16 mode") - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -127,7 +119,8 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) + help="The scheduler type of stable diffusion.", + ) parser.add_argument( "--infer_op", type=str, @@ -137,23 +130,13 @@ def parse_arguments(): "raw", "all", ], - help="The type of infer op.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") - parser.add_argument( - "--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") + help="The type of infer op.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") + parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") return parser.parse_args() @@ -169,15 +152,16 @@ def create_ort_runtime(device_id=0): def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - use_bf16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, ): + use_trt=False, + dynamic_shape=None, + use_fp16=False, + use_bf16=False, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + paddle_stream=None, + workspace=None, +): assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive" option = fd.RuntimeOption() option.use_paddle_backend() @@ -218,7 +202,8 @@ def create_paddle_inference_runtime( key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), - shape_dict.get("max_shape", None), ) + shape_dict.get("max_shape", None), + ) return option @@ -229,8 +214,10 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): option.use_ascend() option.set_lite_device_names(["huawei_ascend_npu"]) option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision". 
- format(device_id)) + "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( + device_id + ) + ) elif device == "kunlunxin_xpu": # TODO(shentanyue): Add kunlunxin_xpu code # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 @@ -242,7 +229,8 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): autotune_file="", precision="int16", adaptive_seqlen=True, - enable_multi_stream=True, ) + enable_multi_stream=True, + ) if use_fp16: option.enable_lite_fp16() else: @@ -250,10 +238,7 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): return option -def create_trt_runtime(workspace=(1 << 31), - dynamic_shape=None, - use_fp16=False, - device_id=0): +def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): option = fd.RuntimeOption() option.use_trt_backend() option.use_gpu(device_id) @@ -267,7 +252,8 @@ def create_trt_runtime(workspace=(1 << 31), key, min_shape=shape_dict["min_shape"], opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), ) + max_shape=shape_dict.get("max_shape", None), + ) return option @@ -277,8 +263,7 @@ def main(args): paddle_stream = None else: paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream( - args.device_id).cuda_stream + paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream seed = 1024 vae_in_channels = 4 @@ -314,12 +299,9 @@ def main(args): vae_decoder_dynamic_shape = { "latent_sample": { - "min_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": - [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], + "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], } } @@ -361,37 +343,38 @@ def main(args): text_encoder=create_ort_runtime(device_id=args.device_id), vae_encoder=create_ort_runtime(device_id=args.device_id), vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), ) + unet=create_ort_runtime(device_id=args.device_id), + ) elif args.backend == "paddlelite": runtime_options = dict( - text_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime( - device=args.device, - device_id=args.device_id, - use_fp16=args.use_fp16), ) + text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), + ) elif args.backend == "tensorrt": runtime_options = dict( text_encoder=create_trt_runtime( dynamic_shape=text_encoder_dynamic_shape, use_fp16=args.use_fp16, - 
device_id=args.device_id, ), + device_id=args.device_id, + ), vae_encoder=create_trt_runtime( dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_decoder=create_trt_runtime( dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), unet=create_trt_runtime( dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), ) + device_id=args.device_id, + ), + ) elif args.backend == "paddle" or args.backend == "paddle_tensorrt": args.use_trt = args.backend == "paddle_tensorrt" runtime_options = dict( @@ -402,31 +385,37 @@ def main(args): use_bf16=args.use_bf16, device_id=args.device_id, disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_encoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_decoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), unet=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), ) + paddle_stream=paddle_stream, + ), + ) pipe = FastDeployStableDiffusionMegaPipeline.from_pretrained( args.model_dir, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -440,9 +429,7 @@ def main(args): else: infer_op_list = [args.infer_op] if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print( - "When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op." 
- ) + print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") infer_op_list = ["raw"] for infer_op in infer_op_list: @@ -452,8 +439,7 @@ def main(args): "text_encoder": infer_op, "unet": infer_op, } - folder = (f"infer_op_{infer_op}_fp16" - if args.use_fp16 else f"infer_op_{infer_op}_fp32") + folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img", "all"]: # text2img @@ -466,7 +452,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test text2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -477,7 +464,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -501,7 +489,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test img2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -513,7 +502,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -524,7 +514,9 @@ def main(args): images[0].save(f"{folder}/img2img.png") if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -545,7 +537,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -558,7 +551,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -600,7 +594,8 @@ def main(args): hr_resize_height=hr_resize_height, enable_hr=True, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test hiresfix performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -614,7 +609,8 @@ def main(args): hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height, enable_hr=True, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -626,7 +622,9 @@ def main(args): if args.task_name in ["cycle_diffusion"]: pipe.change_scheduler("ddim") - image_url = 
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + image_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + ) init_image = load_image(image_url) source_prompt = "An astronaut riding a horse" prompt = "An astronaut riding an elephant" @@ -644,7 +642,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images[0] + infer_op_dict=infer_op_dict, + ).images[0] print("==> Test cycle diffusion performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -661,7 +660,8 @@ def main(args): height=height, width=width, parse_prompt_type=parse_prompt_type, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -692,11 +692,13 @@ def main(args): time_costs = [] # warmup mixture_tiling_pipe( - prompt=[[ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ]], + prompt=[ + [ + "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + ] + ], tile_height=512, tile_width=512, tile_row_overlap=0, @@ -704,16 +706,19 @@ def main(args): guidance_scale=8, seed=7178915308, num_inference_steps=50, - infer_op_dict=None, ) + infer_op_dict=None, + ) print("==> Test mixture tiling.") for step in trange(args.benchmark_steps): start = time.time() images = mixture_tiling_pipe( - prompt=[[ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ]], + prompt=[ + [ + "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + # "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + # "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + ] + ], tile_height=512, tile_width=512, tile_row_overlap=0, @@ -721,7 +726,8 @@ def main(args): 
guidance_scale=8, seed=7178915308, num_inference_steps=50, - infer_op_dict=None, )["images"] + infer_op_dict=None, + )["images"] latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/infer_dygraph.py b/ppdiffusers/deploy/infer_dygraph.py index 4516d4b4a6d4e..f2b42a7b1daaa 100644 --- a/ppdiffusers/deploy/infer_dygraph.py +++ b/ppdiffusers/deploy/infer_dygraph.py @@ -35,17 +35,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -69,23 +72,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode") parser.add_argument( "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -105,21 +102,12 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") return parser.parse_args() @@ -137,7 +125,8 @@ def main(args): feature_extractor=None, requires_safety_checker=False, paddle_dtype=paddle_dtype, - custom_pipeline="stable_diffusion_mega", ) + custom_pipeline="stable_diffusion_mega", + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) parse_prompt_type = args.parse_prompt_type @@ -162,16 +151,13 @@ def main(args): raise ValueError(e) if not args.use_fp16 and attention_type == "flash": - print( - "Flash attention is not supported dtype=float32! Please use float16 or bfloat16. 
We will skip this!") continue width = args.width height = args.height hr_resize_width = args.hr_resize_width hr_resize_height = args.hr_resize_height - folder = (f"attn_{attention_type}_fp16" - if args.use_fp16 else f"attn_{attention_type}_fp32") + folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img", "all"]: # text2img @@ -183,7 +169,8 @@ def main(args): num_inference_steps=10, height=height, width=width, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test text2img performance.") paddle.seed(seed) for step in trange(args.benchmark_steps): @@ -193,7 +180,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -216,7 +204,8 @@ def main(args): num_inference_steps=20, height=height, width=width, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test img2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -227,7 +216,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -238,7 +228,9 @@ def main(args): images[0].save(f"{folder}/img2img.png") if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url) mask_image = load_image(mask_url) @@ -263,7 +255,8 @@ def main(args): num_inference_steps=20, height=height, width=width, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -275,7 +268,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -288,7 +282,9 @@ def main(args): if args.task_name in ["cycle_diffusion", "all"]: pipe.change_scheduler("ddim") - image_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + image_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/ride_on_horse.png" + ) init_image = load_image(image_url) source_prompt = "An astronaut riding a horse" prompt = "An astronaut riding an elephant" @@ -305,7 +301,8 @@ def main(args): source_guidance_scale=1, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images[0] + parse_prompt_type=parse_prompt_type, + ).images[0] print("==> Test cycle diffusion performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -321,7 +318,8 @@ def main(args): 
source_guidance_scale=1, height=height, width=width, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -345,7 +343,8 @@ def main(args): hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height, enable_hr=True, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) print("==> Test hiresfix performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -359,7 +358,8 @@ def main(args): hr_resize_width=hr_resize_width, hr_resize_height=hr_resize_height, enable_hr=True, - parse_prompt_type=parse_prompt_type, ).images + parse_prompt_type=parse_prompt_type, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/infer_dygraph_torch.py b/ppdiffusers/deploy/infer_dygraph_torch.py index fad812b22d8df..0f50cdd5a7502 100644 --- a/ppdiffusers/deploy/infer_dygraph_torch.py +++ b/ppdiffusers/deploy/infer_dygraph_torch.py @@ -18,19 +18,27 @@ import torch -torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) +torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention delattr(torch.nn.functional, "scaled_dot_product_attention") import numpy as np from diffusers import ( - CycleDiffusionPipeline, DDIMScheduler, DDPMScheduler, - DEISMultistepScheduler, DiffusionPipeline, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, UniPCMultistepScheduler) -from diffusers.models.attention_processor import (AttnProcessor, - AttnProcessor2_0) + CycleDiffusionPipeline, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0 from diffusers.utils import load_image from tqdm.auto import trange @@ -52,46 +60,40 @@ def change_scheduler(self, scheduler_type="ddim"): self.orginal_scheduler_config = self.scheduler.config scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + 
scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") return scheduler @@ -103,17 +105,20 @@ def parse_arguments(): parser.add_argument( "--pretrained_model_name_or_path", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=10, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--task_name", type=str, @@ -136,20 +141,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) parser.add_argument( "--channels_last", type=strtobool, default=False, - help="Wheter to use channels_last", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="Wheter to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument( - "--compile", type=strtobool, default=False, help="compile") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") parser.add_argument( "--attention_type", type=str, @@ -158,12 +160,9 @@ def parse_arguments(): "raw", "sdp", ], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -183,11 +182,10 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") return parser.parse_args() @@ -246,8 +244,8 @@ def main(args): feature_extractor=None, requires_safety_checker=False, torch_dtype=torch_dtype, - custom_pipeline="stable_diffusion_mega" - if args.parse_prompt_type == "raw" else "lpw_stable_diffusion", ) + custom_pipeline="stable_diffusion_mega" if args.parse_prompt_type == "raw" else "lpw_stable_diffusion", + ) scheduler = change_scheduler(pipe, args.scheduler) pipe.scheduler = scheduler if args.device_id >= 0: @@ -259,11 +257,9 @@ def main(args): args.attention_type = [args.attention_type] for attention_type in args.attention_type: - attn_prrocessor_cls = (AttnProcessor - if attention_type == "raw" else AttnProcessor2_0) + attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = ( - torch.nn.functional.scaled_dot_product_attention_) + torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ set_attn_processor(pipe.unet, attn_prrocessor_cls()) set_attn_processor(pipe.vae, attn_prrocessor_cls()) @@ -272,15 +268,13 @@ def main(args): if args.compile: print("Run torch compile") - pipe.unet = torch.compile( - pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) width = args.width height = args.height pipe.set_progress_bar_config(disable=True) - folder = (f"torch_attn_{attention_type}_fp16" - if args.use_fp16 else f"torch_attn_{attention_type}_fp32") + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) if args.task_name in ["text2img", "all"]: # text2img @@ -291,7 +285,8 @@ def main(args): prompt, num_inference_steps=10, height=height, - width=width, ) + width=width, + ) print("==> Test text2img performance.") torch.cuda.manual_seed(seed) for step in trange(args.benchmark_steps): @@ -300,7 +295,8 @@ def main(args): prompt, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -322,7 +318,8 @@ def main(args): image=init_image, num_inference_steps=20, height=height, - width=width, ) + width=width, + ) print("==> Test img2img performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -332,7 +329,8 @@ def main(args): image=init_image, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -343,7 +341,9 @@ def main(args): images[0].save(f"{folder}/img2img.png") if args.task_name in ["inpaint", "inpaint_legacy", "all"]: - img_url = 
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" init_image = load_image(img_url).resize((width, height)) mask_image = load_image(mask_url).resize((width, height)) @@ -365,7 +365,8 @@ def main(args): prompt, image=init_image, mask_image=mask_image, - num_inference_steps=20, ) + num_inference_steps=20, + ) print(f"==> Test {task_name} performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -374,7 +375,8 @@ def main(args): prompt, image=init_image, mask_image=mask_image, - num_inference_steps=args.inference_steps, ).images + num_inference_steps=args.inference_steps, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") @@ -395,7 +397,8 @@ def main(args): scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) cycle_pipe.set_progress_bar_config(disable=True) scheduler = change_scheduler(cycle_pipe, "ddim") cycle_pipe.scheduler = scheduler @@ -413,7 +416,8 @@ def main(args): eta=0.1, strength=0.8, guidance_scale=2, - source_guidance_scale=1, ).images[0] + source_guidance_scale=1, + ).images[0] print("==> Test cycle diffusion performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -426,7 +430,8 @@ def main(args): eta=0.1, strength=0.8, guidance_scale=2, - source_guidance_scale=1, ).images + source_guidance_scale=1, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py b/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py index 3fe17fd46e9c8..d26495eaa34ba 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/export_model.py @@ -20,21 +20,23 @@ import paddle from ppdiffusers import ( - FastDeployRuntimeModel, FastDeployStableDiffusionImageVariationPipeline, - StableDiffusionImageVariationPipeline, UNet2DConditionModel) + FastDeployRuntimeModel, + FastDeployStableDiffusionImageVariationPipeline, + StableDiffusionImageVariationPipeline, + UNet2DConditionModel, +) def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - model_path: str, - output_path: str, - sample: bool=False, - height: int=None, - width: int=None, ): + model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): # specify unet model with unet pre_temb_act opt enabled. 
- unet_model = UNet2DConditionModel.from_pretrained( - model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") - pipeline = StableDiffusionImageVariationPipeline.from_pretrained( - model_path, unet=unet_model, safety_checker=None) + unet_model = UNet2DConditionModel.from_pretrained(model_path, resnet_pre_temb_non_linearity=True, subfolder="unet") + pipeline = StableDiffusionImageVariationPipeline.from_pretrained(model_path, unet=unet_model, safety_checker=None) # make sure we disable xformers pipeline.disable_xformers_memory_efficient_attention() output_path = Path(output_path) @@ -42,8 +44,7 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( latent_height = height // 8 if height is not None else None latent_width = width // 8 if width is not None else None # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 or 9 vae_in_channels = pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -51,13 +52,13 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert image_encoder image_encoder = paddle.jit.to_static( pipeline.image_encoder, input_spec=[ - paddle.static.InputSpec( - shape=[None, 3, 224, 224], dtype="float32", name="pixel_values") + paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype="float32", name="pixel_values") ], # pixel_values ) save_path = os.path.join(args.output_path, "image_encoder", "inference") @@ -72,14 +73,16 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( paddle.static.InputSpec( shape=[None, unet_channels, latent_height, latent_width], dtype="float32", - name="sample", ), # sample - paddle.static.InputSpec( - shape=[1], dtype="float32", name="timestep"), # timestep + name="sample", + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states - ], ) + name="encoder_hidden_states", + ), # encoder_hidden_states + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) print(f"Save unet model in {save_path} successfully.") @@ -94,8 +97,7 @@ def forward_vae_encoder_sample(self, z): # 3. Convert vae encoder vae_encoder = pipeline.vae if sample: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) else: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) @@ -107,7 +109,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. 
save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -126,8 +129,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -137,17 +142,15 @@ def forward_vae_decoder(self, z): fd_pipe_cls = FastDeployStableDiffusionImageVariationPipeline fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - image_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "image_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + image_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "image_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=pipeline.feature_extractor, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -161,26 +164,25 @@ def forward_vae_decoder(self, z): required=True, help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--sample", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) parser.add_argument( "--height", type=int, default=None, - help="The height of output images. Default: None", ) + help="The height of output images. Default: None", + ) parser.add_argument( "--width", type=int, default=None, - help="The width of output images. Default: None", ) + help="The width of output images. 
Default: None", + ) args = parser.parse_args() convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( @@ -188,4 +190,5 @@ def forward_vae_decoder(self, z): args.output_path, args.sample, args.height, - args.width, ) + args.width, + ) diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py index 55b908c4787b2..dcb4b78edb046 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/infer.py @@ -32,17 +32,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="lambdalabs/sd-image-variations-diffusers@fastdeploy", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--backend", type=str, @@ -62,7 +65,8 @@ def parse_arguments(): "huawei_ascend_npu", "kunlunxin_xpu", ], - help="The inference runtime device of models.", ) + help="The inference runtime device of models.", + ) parser.add_argument( "--parse_prompt_type", type=str, @@ -71,22 +75,11 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") - parser.add_argument( - "--use_bf16", - type=strtobool, - default=False, - help="Wheter to use BF16 mode") - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") + parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode") + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -107,7 +100,8 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) + help="The scheduler type of stable diffusion.", + ) parser.add_argument( "--infer_op", type=str, @@ -117,23 +111,13 @@ def parse_arguments(): "raw", "all", ], - help="The type of infer op.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") - parser.add_argument( - "--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") + help="The type of infer op.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") + parser.add_argument("--is_sd2_0", type=strtobool, default=False, help="Is sd2_0 model?") return parser.parse_args() @@ -149,15 +133,16 @@ def create_ort_runtime(device_id=0): def create_paddle_inference_runtime( - use_trt=False, - dynamic_shape=None, - use_fp16=False, - use_bf16=False, - device_id=0, - disable_paddle_trt_ops=[], - disable_paddle_pass=[], - paddle_stream=None, - workspace=None, ): + use_trt=False, + dynamic_shape=None, + use_fp16=False, + use_bf16=False, + device_id=0, + disable_paddle_trt_ops=[], + disable_paddle_pass=[], + paddle_stream=None, + workspace=None, +): assert not use_fp16 or not use_bf16, "use_fp16 and use_bf16 are mutually exclusive" option = fd.RuntimeOption() option.use_paddle_backend() @@ -198,7 +183,8 @@ def create_paddle_inference_runtime( key, shape_dict["min_shape"], shape_dict.get("opt_shape", None), - shape_dict.get("max_shape", None), ) + shape_dict.get("max_shape", None), + ) return option @@ -209,8 +195,10 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): option.use_ascend() option.set_lite_device_names(["huawei_ascend_npu"]) option.set_lite_context_properties( - "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision". 
- format(device_id)) + "HUAWEI_ASCEND_NPU_SELECTED_DEVICE_IDS={};HUAWEI_ASCEND_NPU_PRECISION_MODE=allow_mix_precision".format( + device_id + ) + ) elif device == "kunlunxin_xpu": # TODO(shentanyue): Add kunlunxin_xpu code # https://github.com/PaddlePaddle/FastDeploy/blob/4c3e7030e151528d304619901c794481bb2f6037/examples/multimodal/stable_diffusion/infer.py#L178-L195 @@ -222,7 +210,8 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): autotune_file="", precision="int16", adaptive_seqlen=True, - enable_multi_stream=True, ) + enable_multi_stream=True, + ) if use_fp16: option.enable_lite_fp16() else: @@ -230,10 +219,7 @@ def create_paddle_lite_runtime(device="cpu", device_id=0, use_fp16=False): return option -def create_trt_runtime(workspace=(1 << 31), - dynamic_shape=None, - use_fp16=False, - device_id=0): +def create_trt_runtime(workspace=(1 << 31), dynamic_shape=None, use_fp16=False, device_id=0): option = fd.RuntimeOption() option.use_trt_backend() option.use_gpu(device_id) @@ -247,7 +233,8 @@ def create_trt_runtime(workspace=(1 << 31), key, min_shape=shape_dict["min_shape"], opt_shape=shape_dict.get("opt_shape", None), - max_shape=shape_dict.get("max_shape", None), ) + max_shape=shape_dict.get("max_shape", None), + ) return option @@ -257,8 +244,7 @@ def main(args): paddle_stream = None else: paddle.set_device(f"gpu:{args.device_id}") - paddle_stream = paddle.device.cuda.current_stream( - args.device_id).cuda_stream + paddle_stream = paddle.device.cuda.current_stream(args.device_id).cuda_stream seed = 1024 vae_in_channels = 4 @@ -286,12 +272,9 @@ def main(args): vae_decoder_dynamic_shape = { "latent_sample": { - "min_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": - [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": - [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], + "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], } } @@ -333,37 +316,38 @@ def main(args): text_encoder=create_ort_runtime(device_id=args.device_id), vae_encoder=create_ort_runtime(device_id=args.device_id), vae_decoder=create_ort_runtime(device_id=args.device_id), - unet=create_ort_runtime(device_id=args.device_id), ) + unet=create_ort_runtime(device_id=args.device_id), + ) elif args.backend == "paddlelite": runtime_options = dict( - text_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_encoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - vae_decoder=create_paddle_lite_runtime( - device=args.device, device_id=args.device_id, use_fp16=False), - unet=create_paddle_lite_runtime( - device=args.device, - device_id=args.device_id, - use_fp16=args.use_fp16), ) + text_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_encoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + vae_decoder=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=False), + unet=create_paddle_lite_runtime(device=args.device, device_id=args.device_id, use_fp16=args.use_fp16), + ) elif args.backend == "tensorrt": runtime_options = dict( image_encoder=create_trt_runtime( dynamic_shape=image_encoder_dynamic_shape, use_fp16=args.use_fp16, - 
device_id=args.device_id, ), + device_id=args.device_id, + ), vae_encoder=create_trt_runtime( dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), vae_decoder=create_trt_runtime( dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), + device_id=args.device_id, + ), unet=create_trt_runtime( dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, - device_id=args.device_id, ), ) + device_id=args.device_id, + ), + ) elif args.backend == "paddle" or args.backend == "paddle_tensorrt": args.use_trt = args.backend == "paddle_tensorrt" runtime_options = dict( @@ -374,31 +358,37 @@ def main(args): use_bf16=args.use_bf16, device_id=args.device_id, disable_paddle_trt_ops=["arg_max", "range", "lookup_table_v2"], - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_encoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_encoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), vae_decoder=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=vae_decoder_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), + paddle_stream=paddle_stream, + ), unet=create_paddle_inference_runtime( use_trt=args.use_trt, dynamic_shape=unet_dynamic_shape, use_fp16=args.use_fp16, use_bf16=args.use_bf16, device_id=args.device_id, - paddle_stream=paddle_stream, ), ) + paddle_stream=paddle_stream, + ), + ) pipe = FastDeployStableDiffusionImageVariationPipeline.from_pretrained( args.model_dir, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) pipe.set_progress_bar_config(disable=True) pipe.change_scheduler(args.scheduler) # parse_prompt_type = args.parse_prompt_type @@ -412,9 +402,7 @@ def main(args): else: infer_op_list = [args.infer_op] if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print( - "When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op." 
- ) + print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") infer_op_list = ["raw"] for infer_op in infer_op_list: @@ -424,12 +412,13 @@ def main(args): "image_encoder": infer_op, "unet": infer_op, } - folder = (f"infer_op_{infer_op}_fp16" - if args.use_fp16 else f"infer_op_{infer_op}_fp32") + folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" os.makedirs(folder, exist_ok=True) # image_variation - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + ) init_image = load_image(img_url) time_costs = [] # warmup @@ -438,7 +427,8 @@ def main(args): num_inference_steps=20, height=height, width=width, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) print("==> Test image_variation performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -448,7 +438,8 @@ def main(args): num_inference_steps=args.inference_steps, height=height, width=width, - infer_op_dict=infer_op_dict, ).images + infer_op_dict=infer_op_dict, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py index 0b9e67bda034b..17b2290357f54 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph.py @@ -35,17 +35,20 @@ def parse_arguments(): parser.add_argument( "--model_dir", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=1, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( "--parse_prompt_type", type=str, @@ -54,37 +57,21 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") parser.add_argument( "--attention_type", type=str, default="raw", choices=["raw", "cutlass", "flash", "all"], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") - parser.add_argument( - "--hr_resize_height", - type=int, - default=768, - help="HR Height of input image") - parser.add_argument( - "--hr_resize_width", - type=int, - default=768, - help="HR Width of input image") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. 
-1 means use cpu") + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") + parser.add_argument("--hr_resize_height", type=int, default=768, help="HR Height of input image") + parser.add_argument("--hr_resize_width", type=int, default=768, help="HR Width of input image") return parser.parse_args() @@ -99,7 +86,8 @@ def main(args): pipe = StableDiffusionImageVariationPipeline.from_pretrained( args.model_dir, safety_checker=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipe.set_progress_bar_config(disable=True) # parse_prompt_type = args.parse_prompt_type if args.attention_type == "all": @@ -126,12 +114,13 @@ def main(args): height = args.height # hr_resize_width = args.hr_resize_width # hr_resize_height = args.hr_resize_height - folder = (f"attn_{attention_type}_fp16" - if args.use_fp16 else f"attn_{attention_type}_fp32") + folder = f"attn_{attention_type}_fp16" if args.use_fp16 else f"attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) # image_variation - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + ) init_image = load_image(img_url) time_costs = [] # warmup @@ -139,7 +128,8 @@ def main(args): image=init_image, num_inference_steps=20, height=height, - width=width, ) + width=width, + ) print("==> Test image_variation performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -148,7 +138,8 @@ def main(args): image=init_image, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py index 126e4f0819e6c..fb1530d071d21 100644 --- a/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py +++ b/ppdiffusers/deploy/stable_diffusion_image_variation/infer_dygraph_torch.py @@ -18,19 +18,26 @@ import torch -torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) +torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention delattr(torch.nn.functional, "scaled_dot_product_attention") import numpy as np from diffusers import ( - DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionImageVariationPipeline, UniPCMultistepScheduler) -from diffusers.models.attention_processor import (AttnProcessor, - AttnProcessor2_0) + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImageVariationPipeline, + UniPCMultistepScheduler, +) +from diffusers.models.attention_processor import AttnProcessor, 
AttnProcessor2_0 from diffusers.utils import load_image from tqdm.auto import trange @@ -52,46 +59,40 @@ def change_scheduler(self, scheduler_type="ddim"): self.orginal_scheduler_config = self.scheduler.config scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") return scheduler @@ -103,17 +104,20 @@ def parse_arguments(): parser.add_argument( "--pretrained_model_name_or_path", default="runwayml/stable-diffusion-v1-5", - help="The model directory of diffusion_model.", ) + help="The model directory of diffusion_model.", + ) parser.add_argument( "--inference_steps", type=int, default=50, - help="The number of unet inference steps.", ) + help="The number of unet inference steps.", + ) parser.add_argument( "--benchmark_steps", type=int, default=10, - help="The number of performance benchmark steps.", ) + help="The number of performance benchmark steps.", + ) parser.add_argument( 
"--parse_prompt_type", type=str, @@ -122,20 +126,17 @@ def parse_arguments(): "raw", "lpw", ], - help="The parse_prompt_type can be one of [raw, lpw]. ", ) + help="The parse_prompt_type can be one of [raw, lpw]. ", + ) parser.add_argument( "--channels_last", type=strtobool, default=False, - help="Wheter to use channels_last", ) - parser.add_argument( - "--use_fp16", - type=strtobool, - default=True, - help="Wheter to use FP16 mode") + help="Wheter to use channels_last", + ) + parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode") parser.add_argument("--tf32", type=strtobool, default=True, help="tf32") - parser.add_argument( - "--compile", type=strtobool, default=False, help="compile") + parser.add_argument("--compile", type=strtobool, default=False, help="compile") parser.add_argument( "--attention_type", type=str, @@ -144,12 +145,9 @@ def parse_arguments(): "raw", "sdp", ], - help="attention_type.", ) - parser.add_argument( - "--device_id", - type=int, - default=0, - help="The selected gpu id. -1 means use cpu") + help="attention_type.", + ) + parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu") parser.add_argument( "--scheduler", type=str, @@ -169,11 +167,10 @@ def parse_arguments(): "kdpm2-ancestral", "kdpm2", ], - help="The scheduler type of stable diffusion.", ) - parser.add_argument( - "--height", type=int, default=512, help="Height of input image") - parser.add_argument( - "--width", type=int, default=512, help="Width of input image") + help="The scheduler type of stable diffusion.", + ) + parser.add_argument("--height", type=int, default=512, help="Height of input image") + parser.add_argument("--width", type=int, default=512, help="Width of input image") return parser.parse_args() @@ -230,7 +227,8 @@ def main(args): args.pretrained_model_name_or_path, safety_checker=None, requires_safety_checker=False, - torch_dtype=torch_dtype, ) + torch_dtype=torch_dtype, + ) scheduler = change_scheduler(pipe, args.scheduler) pipe.scheduler = scheduler if args.device_id >= 0: @@ -242,11 +240,9 @@ def main(args): args.attention_type = [args.attention_type] for attention_type in args.attention_type: - attn_prrocessor_cls = (AttnProcessor - if attention_type == "raw" else AttnProcessor2_0) + attn_prrocessor_cls = AttnProcessor if attention_type == "raw" else AttnProcessor2_0 if attention_type == "sdp": - torch.nn.functional.scaled_dot_product_attention = ( - torch.nn.functional.scaled_dot_product_attention_) + torch.nn.functional.scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention_ set_attn_processor(pipe.unet, attn_prrocessor_cls()) set_attn_processor(pipe.vae, attn_prrocessor_cls()) @@ -255,19 +251,19 @@ def main(args): if args.compile: print("Run torch compile") - pipe.unet = torch.compile( - pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) width = args.width height = args.height pipe.set_progress_bar_config(disable=True) - folder = (f"torch_attn_{attention_type}_fp16" - if args.use_fp16 else f"torch_attn_{attention_type}_fp32") + folder = f"torch_attn_{attention_type}_fp16" if args.use_fp16 else f"torch_attn_{attention_type}_fp32" os.makedirs(folder, exist_ok=True) # image_vairation - img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + img_url = ( + 
"https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + ) init_image = load_image(img_url).resize((width, height)) time_costs = [] # warmup @@ -275,7 +271,8 @@ def main(args): image=init_image, num_inference_steps=20, height=height, - width=width, ) + width=width, + ) print("==> Test image_vairation performance.") for step in trange(args.benchmark_steps): start = time.time() @@ -284,7 +281,8 @@ def main(args): image=init_image, num_inference_steps=args.inference_steps, height=height, - width=width, ).images + width=width, + ).images latency = time.time() - start time_costs += [latency] # print(f"No {step:3d} time cost: {latency:2f} s") diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/app.py b/ppdiffusers/examples/Stable-CycleDiffusion/app.py index 121d115b10745..705f42d3d6fa3 100644 --- a/ppdiffusers/examples/Stable-CycleDiffusion/app.py +++ b/ppdiffusers/examples/Stable-CycleDiffusion/app.py @@ -37,7 +37,8 @@ pipe = CycleDiffusionPipeline.from_pretrained( model_id_or_path, use_auth_token=os.environ.get("USER_TOKEN"), - paddle_dtype=paddle_dtype, ) + paddle_dtype=paddle_dtype, +) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) tokenizer = pipe.tokenizer @@ -45,17 +46,11 @@ class LocalBlend: def __call__(self, x_t, attention_store): k = 1 - maps = attention_store["down_cross"][2:4] + attention_store[ - "up_cross"][:3] - maps = [ - item.reshape( - [self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS]) - for item in maps - ] + maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3] + maps = [item.reshape([self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS]) for item in maps] maps = paddle.concat(maps, axis=1) maps = (maps * self.alpha_layers).sum(-1).mean(1) - mask = F.max_pool2d( - maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k)) + mask = F.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k)) mask = F.interpolate(mask, size=(x_t.shape[2:])) mask = mask / mask.max(2, keepdim=True)[0].max(3, keepdim=True)[0] mask = mask > self.threshold @@ -150,8 +145,7 @@ def between_steps(self): def get_average_attention(self): average_attention = { - key: [item / self.cur_step for item in self.attention_store[key]] - for key in self.attention_store + key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store } return average_attention @@ -174,8 +168,7 @@ def step_callback(self, x_t): def replace_self_attention(self, attn_base, att_replace): if att_replace.shape[2] <= 16**2: - return attn_base.unsqueeze(0).expand( - [att_replace.shape[0], *attn_base.shape]) + return attn_base.unsqueeze(0).expand([att_replace.shape[0], *attn_base.shape]) else: return att_replace @@ -185,36 +178,35 @@ def replace_cross_attention(self, attn_base, att_replace): def forward(self, attn, is_cross: bool, place_in_unet: str): super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet) - if is_cross or (self.num_self_replace[0] <= self.cur_step < - self.num_self_replace[1]): + if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]): attn_base, attn_repalce = attn[0], attn[1:] if is_cross: alpha_words = self.cross_replace_alpha[self.cur_step] - attn_replace_new = (self.replace_cross_attention( - attn_base, attn_repalce) * alpha_words + - (1 - alpha_words) * attn_repalce) + attn_replace_new = ( + self.replace_cross_attention(attn_base, attn_repalce) * alpha_words + + (1 - alpha_words) * attn_repalce + ) attn[1:] = attn_replace_new 
else: attn[1:] = self.replace_self_attention(attn_base, attn_repalce) return attn def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: Union[float, Tuple[float, float], Dict[ - str, Tuple[float, float]]], - self_replace_steps: Union[float, Tuple[float, float]], - local_blend: Optional[LocalBlend], ): + self, + prompts, + num_steps: int, + cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]], + self_replace_steps: Union[float, Tuple[float, float]], + local_blend: Optional[LocalBlend], + ): super(AttentionControlEdit, self).__init__() self.batch_size = len(prompts) self.cross_replace_alpha = ptp_utils.get_time_words_attention_alpha( - prompts, num_steps, cross_replace_steps, - tokenizer).cast(paddle_dtype) + prompts, num_steps, cross_replace_steps, tokenizer + ).cast(paddle_dtype) if type(self_replace_steps) is float or type(self_replace_steps) is int: self_replace_steps = 0, self_replace_steps - self.num_self_replace = int(num_steps * self_replace_steps[0]), int( - num_steps * self_replace_steps[1]) + self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1]) self.local_blend = local_blend @@ -223,17 +215,17 @@ def replace_cross_attention(self, attn_base, att_replace): return paddle.einsum("hpw,bwn->bhpn", attn_base, self.mapper) def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: float, - self_replace_steps: float, - local_blend: Optional[LocalBlend]=None, ): - super(AttentionReplace, self).__init__(prompts, num_steps, - cross_replace_steps, - self_replace_steps, local_blend) - self.mapper = seq_aligner.get_replacement_mapper( - prompts, tokenizer).cast(paddle_dtype) + self, + prompts, + num_steps: int, + cross_replace_steps: float, + self_replace_steps: float, + local_blend: Optional[LocalBlend] = None, + ): + super(AttentionReplace, self).__init__( + prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend + ) + self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).cast(paddle_dtype) class AttentionRefine(AttentionControlEdit): @@ -243,35 +235,33 @@ def replace_cross_attention(self, attn_base, att_replace): # pt: a[:, :, b].shape = torch.Size([8, 4096, 1, 77]) # pd: a.take_along_axis(b.unsqueeze(0), axis=-1).unsqueeze(-2) - attn_base_replace = (attn_base.take_along_axis( - self.mapper.unsqueeze(0), axis=-1).unsqueeze(-2) - .transpose([2, 0, 1, 3])) - attn_replace = attn_base_replace * self.alphas + att_replace * ( - 1 - self.alphas) + attn_base_replace = ( + attn_base.take_along_axis(self.mapper.unsqueeze(0), axis=-1).unsqueeze(-2).transpose([2, 0, 1, 3]) + ) + attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas) return attn_replace def __init__( - self, - prompts, - num_steps: int, - cross_replace_steps: float, - self_replace_steps: float, - local_blend: Optional[LocalBlend]=None, ): - super(AttentionRefine, self).__init__(prompts, num_steps, - cross_replace_steps, - self_replace_steps, local_blend) - self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, - tokenizer) + self, + prompts, + num_steps: int, + cross_replace_steps: float, + self_replace_steps: float, + local_blend: Optional[LocalBlend] = None, + ): + super(AttentionRefine, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend) + self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, tokenizer) alphas = alphas.cast(paddle_dtype) self.alphas = alphas.reshape([alphas.shape[0], 1, 1, 
alphas.shape[1]]) def get_equalizer( - text: str, - word_select: Union[int, Tuple[int, ...]], - values: Union[List[float], Tuple[float, ...]], ): + text: str, + word_select: Union[int, Tuple[int, ...]], + values: Union[List[float], Tuple[float, ...]], +): if type(word_select) is int or type(word_select) is str: - word_select = (word_select, ) + word_select = (word_select,) equalizer = paddle.ones([len(values), 77]) values = paddle.to_tensor(values, dtype=paddle_dtype) for word in word_select: @@ -281,19 +271,20 @@ def get_equalizer( def inference( - source_prompt, - target_prompt, - source_guidance_scale=1, - guidance_scale=5, - num_inference_steps=100, - width=512, - height=512, - seed=0, - img=None, - strength=0.7, - cross_attention_control="None", - cross_replace_steps=0.8, - self_replace_steps=0.4, ): + source_prompt, + target_prompt, + source_guidance_scale=1, + guidance_scale=5, + num_inference_steps=100, + width=512, + height=512, + seed=0, + img=None, + strength=0.7, + cross_attention_control="None", + cross_replace_steps=0.8, + self_replace_steps=0.4, +): paddle.seed(seed) @@ -312,21 +303,22 @@ def inference( [source_prompt, target_prompt], num_inference_steps, cross_replace_steps=cross_replace_steps, - self_replace_steps=self_replace_steps, ) + self_replace_steps=self_replace_steps, + ) ptp_utils.register_attention_control(pipe, controller) elif cross_attention_control == "Refine": controller = AttentionRefine( [source_prompt, target_prompt], num_inference_steps, cross_replace_steps=cross_replace_steps, - self_replace_steps=self_replace_steps, ) + self_replace_steps=self_replace_steps, + ) ptp_utils.register_attention_control(pipe, controller) elif cross_attention_control == "None": controller = EmptyControl() ptp_utils.register_attention_control(pipe, controller) else: - raise ValueError("Unknown cross_attention_control: {}".format( - cross_attention_control)) + raise ValueError("Unknown cross_attention_control: {}".format(cross_attention_control)) with paddle.amp.auto_cast(True, level="O2"): results = pipe( @@ -337,7 +329,8 @@ def inference( eta=0.1, strength=strength, guidance_scale=guidance_scale, - source_guidance_scale=source_guidance_scale, ) + source_guidance_scale=source_guidance_scale, + ) if pipe.safety_checker is None: return results.images[0] else: @@ -354,7 +347,8 @@ def replace_nsfw_images(results): css = """.cycle-diffusion-div div{display:inline-flex;align-items:center;gap:.8rem;font-size:1.75rem}.cycle-diffusion-div div h1{font-weight:900;margin-bottom:7px}.cycle-diffusion-div p{margin-bottom:10px;font-size:94%}.cycle-diffusion-div p a{text-decoration:underline}.tabs{margin-top:0;margin-bottom:0}#gallery{min-height:20rem} """ with gr.Blocks(css=css) as demo: - gr.HTML(""" + gr.HTML( + """

CycleDiffusion with Stable Diffusion

@@ -370,9 +364,11 @@ def replace_nsfw_images(results):
                2. Click the "Run CycleDiffusion" button.

- """) + """ + ) with gr.Accordion("See Details", open=False): - gr.HTML(""" + gr.HTML( + """

How to use:
@@ -396,14 +392,14 @@ def replace_nsfw_images(results):
                1. 20s on A10G.

- """) + """ + ) with gr.Row(): with gr.Column(scale=55): with gr.Group(): - img = gr.Image( - label="Input image", height=512, tool="editor", type="pil") + img = gr.Image(label="Input image", height=512, tool="editor", type="pil") image_out = gr.Image(label="Output image", height=512) # gallery = gr.Gallery( @@ -422,7 +418,8 @@ def replace_nsfw_images(results): label="Source guidance scale", value=1, minimum=1, - maximum=10, ) + maximum=10, + ) with gr.Row(): target_prompt = gr.Textbox( label="Target prompt", @@ -432,14 +429,16 @@ def replace_nsfw_images(results): label="Target guidance scale", value=5, minimum=1, - maximum=10, ) + maximum=10, + ) with gr.Row(): strength = gr.Slider( label="Strength", value=0.7, minimum=0.5, maximum=1, - step=0.01, ) + step=0.01, + ) with gr.Row(): generate1 = gr.Button(value="Run CycleDiffusion") @@ -449,7 +448,8 @@ def replace_nsfw_images(results): cross_attention_control = gr.Radio( label="CAC type", choices=["None", "Replace", "Refine"], - value="None", ) + value="None", + ) with gr.Row(): # If not "None", the following two parameters will be used. cross_replace_steps = gr.Slider( @@ -457,13 +457,15 @@ def replace_nsfw_images(results): value=0.8, minimum=0.0, maximum=1, - step=0.01, ) + step=0.01, + ) self_replace_steps = gr.Slider( label="Self replace steps", value=0.4, minimum=0.0, maximum=1, - step=0.01, ) + step=0.01, + ) with gr.Row(): generate2 = gr.Button(value="Run CycleDiffusion") @@ -475,23 +477,13 @@ def replace_nsfw_images(results): value=100, minimum=25, maximum=500, - step=1, ) - width = gr.Slider( - label="Width", - value=512, - minimum=512, - maximum=1024, - step=8) - height = gr.Slider( - label="Height", - value=512, - minimum=512, - maximum=1024, - step=8) + step=1, + ) + width = gr.Slider(label="Width", value=512, minimum=512, maximum=1024, step=8) + height = gr.Slider(label="Height", value=512, minimum=512, maximum=1024, step=8) with gr.Row(): - seed = gr.Slider( - 0, 2147483647, label="Seed", value=0, step=1) + seed = gr.Slider(0, 2147483647, label="Seed", value=0, step=1) with gr.Row(): generate3 = gr.Button(value="Run CycleDiffusion") @@ -714,11 +706,14 @@ def replace_nsfw_images(results): ], image_out, inference, - cache_examples=True, ) + cache_examples=True, + ) - gr.Markdown(""" + gr.Markdown( + """ Space built with PPDiffusers 🧨 by PaddleNLP. 
[![Twitter Follow](https://img.shields.io/twitter/follow/ChenHenryWu?style=social)](https://twitter.com/ChenHenryWu) - """) + """ + ) demo.launch(debug=True, share=True, server_name="0.0.0.0", server_port=8581) diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py b/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py index d09df121e1427..15df9ac4402ff 100644 --- a/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py +++ b/ppdiffusers/examples/Stable-CycleDiffusion/ptp_utils.py @@ -22,13 +22,9 @@ def register_attention_control(model, controller): def ca_forward(self, place_in_unet): - def forward(hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs): + def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = self.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = self.to_q(hidden_states) query = self.head_to_batch_dim(query) @@ -41,11 +37,9 @@ def forward(hidden_states, key = self.head_to_batch_dim(key) value = self.head_to_batch_dim(value) - attention_probs = self.get_attention_scores(query, key, - attention_mask) + attention_probs = self.get_attention_scores(query, key, attention_mask) - attention_probs = controller(attention_probs, is_cross, - place_in_unet) + attention_probs = controller(attention_probs, is_cross, place_in_unet) hidden_states = paddle.matmul(attention_probs, value) hidden_states = self.batch_to_head_dim(hidden_states) @@ -82,17 +76,12 @@ def register_recr(net_, count, place_in_unet): def get_word_inds(text: str, word_place: int, tokenizer): split_text = text.split(" ") if type(word_place) is str: - word_place = [ - i for i, word in enumerate(split_text) if word_place == word - ] + word_place = [i for i, word in enumerate(split_text) if word_place == word] elif type(word_place) is int: word_place = [word_place] out = [] if len(word_place) > 0: - words_encode = [ - tokenizer.decode([item]).strip("#") - for item in tokenizer.encode(text).input_ids - ][1:-1] + words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text).input_ids][1:-1] cur_len, ptr = 0, 0 for i in range(len(words_encode)): @@ -106,14 +95,14 @@ def get_word_inds(text: str, word_place: int, tokenizer): def update_alpha_time_word( - alpha, - bounds: Union[float, Tuple[float, float]], - prompt_ind: int, - word_inds: Optional[paddle.Tensor]=None, ): + alpha, + bounds: Union[float, Tuple[float, float]], + prompt_ind: int, + word_inds: Optional[paddle.Tensor] = None, +): if type(bounds) is float or bounds == 0: bounds = 0, bounds - start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * - alpha.shape[0]) + start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0]) if word_inds is None: word_inds = paddle.arange(alpha.shape[2]) alpha[:start, prompt_ind, word_inds] = 0 @@ -123,32 +112,26 @@ def update_alpha_time_word( def get_time_words_attention_alpha( - prompts, - num_steps, - cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[ - float, float]]], - tokenizer, - max_num_words=77, ): + prompts, + num_steps, + cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]], + tokenizer, + max_num_words=77, +): if type(cross_replace_steps) is not dict: cross_replace_steps = {"default_": cross_replace_steps} if "default_" not in 
cross_replace_steps: cross_replace_steps["default_"] = (0.0, 1.0) - alpha_time_words = paddle.zeros( - [num_steps + 1, len(prompts) - 1, max_num_words]) + alpha_time_words = paddle.zeros([num_steps + 1, len(prompts) - 1, max_num_words]) for i in range(len(prompts) - 1): - alpha_time_words = update_alpha_time_word( - alpha_time_words, cross_replace_steps["default_"], i) + alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"], i) for key, item in cross_replace_steps.items(): if key != "default_": - inds = [ - get_word_inds(prompts[i], key, tokenizer) - for i in range(1, len(prompts)) - ] + inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))] for i, ind in enumerate(inds): if len(ind) > 0: - alpha_time_words = update_alpha_time_word(alpha_time_words, - item, i, ind) + alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind) alpha_time_words = alpha_time_words.reshape( - [num_steps + 1, len(prompts) - 1, 1, 1, - max_num_words]) # time, batch, heads, pixels, words + [num_steps + 1, len(prompts) - 1, 1, 1, max_num_words] + ) # time, batch, heads, pixels, words return alpha_time_words diff --git a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py b/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py index e1b1bc7bb6ccf..24c30b91e7f7d 100644 --- a/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py +++ b/ppdiffusers/examples/Stable-CycleDiffusion/seq_aligner.py @@ -66,8 +66,7 @@ def global_align(x, y, score): for j in range(1, len(y) + 1): left = matrix[i, j - 1] + score.gap up = matrix[i - 1, j] + score.gap - diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], - y[j - 1]) + diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1]) matrix[i, j] = max(left, up, diag) if matrix[i, j] == left: trace_back[i, j] = 1 @@ -112,14 +111,20 @@ def get_mapper(x: str, y: str, tokenizer, max_len=77): score = ScoreParams(0, 1, -1) matrix, trace_back = global_align(x_seq, y_seq, score) mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1] - alphas = paddle.ones([max_len, ]) - alphas[:mapper_base.shape[0]] = (mapper_base[:, 1] != -1).cast("float32") + alphas = paddle.ones( + [ + max_len, + ] + ) + alphas[: mapper_base.shape[0]] = (mapper_base[:, 1] != -1).cast("float32") mapper = paddle.zeros( - [max_len, ], - dtype=paddle.int64, ) - mapper[:mapper_base.shape[0]] = mapper_base[:, 1] - mapper[mapper_base.shape[0]:] = len(y_seq) + paddle.arange( - max_len - len(y_seq), dtype="int64") + [ + max_len, + ], + dtype=paddle.int64, + ) + mapper[: mapper_base.shape[0]] = mapper_base[:, 1] + mapper[mapper_base.shape[0] :] = len(y_seq) + paddle.arange(max_len - len(y_seq), dtype="int64") return mapper, alphas @@ -136,17 +141,12 @@ def get_refinement_mapper(prompts, tokenizer, max_len=77): def get_word_inds(text: str, word_place: int, tokenizer): split_text = text.split(" ") if type(word_place) is str: - word_place = [ - i for i, word in enumerate(split_text) if word_place == word - ] + word_place = [i for i, word in enumerate(split_text) if word_place == word] elif type(word_place) is int: word_place = [word_place] out = [] if len(word_place) > 0: - words_encode = [ - tokenizer.decode([item]).strip("#") - for item in tokenizer.encode(text).input_ids - ][1:-1] + words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text).input_ids][1:-1] cur_len, ptr = 0, 0 for i in range(len(words_encode)): @@ -175,8 +175,7 @@ def get_replacement_mapper_(x: str, y: str, 
tokenizer, max_len=77): cur_inds = 0 while i < max_len and j < max_len: if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i: - inds_source_, inds_target_ = inds_source[cur_inds], inds_target[ - cur_inds] + inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds] if len(inds_source_) == len(inds_target_): mapper[inds_source_, inds_target_] = 1 else: diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py b/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py index 81a81d63cc039..e086453002714 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/autoencoder_datasets.py @@ -45,13 +45,10 @@ def __init__(self, paths, size=None, random_crop=False, labels=None): if self.size is not None and self.size > 0: self.rescaler = albumentations.SmallestMaxSize(max_size=self.size) if not self.random_crop: - self.cropper = albumentations.CenterCrop( - height=self.size, width=self.size) + self.cropper = albumentations.CenterCrop(height=self.size, width=self.size) else: - self.cropper = albumentations.RandomCrop( - height=self.size, width=self.size) - self.preprocessor = albumentations.Compose( - [self.rescaler, self.cropper]) + self.cropper = albumentations.RandomCrop(height=self.size, width=self.size) + self.preprocessor = albumentations.Compose([self.rescaler, self.cropper]) else: self.preprocessor = lambda **kwargs: kwargs @@ -102,12 +99,7 @@ def __init__(self, cause, keys=None, visited=None): super().__init__(message) -def retrieve(list_or_dict, - key, - splitval="/", - default=None, - expand=True, - pass_success=False): +def retrieve(list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False): """Given a nested list or dict return the desired value at key expanding callable nodes if necessary and :attr:`expand` is ``True``. The expansion is done in-place. @@ -150,11 +142,10 @@ def retrieve(list_or_dict, if callable(list_or_dict): if not expand: raise KeyNotFoundError( - ValueError( - "Trying to get past callable node with expand=False." - ), + ValueError("Trying to get past callable node with expand=False."), keys=keys, - visited=visited, ) + visited=visited, + ) list_or_dict = list_or_dict() parent[last_key] = list_or_dict @@ -187,23 +178,19 @@ def retrieve(list_or_dict, return list_or_dict, success -def give_synsets_from_indices(indices, - path_to_yaml="data/imagenet_idx_to_synset.yaml"): +def give_synsets_from_indices(indices, path_to_yaml="data/imagenet_idx_to_synset.yaml"): synsets = [] with open(path_to_yaml) as f: di2s = yaml.load(f) for idx in indices: synsets.append(str(di2s[idx])) - print("Using {} different synsets for construction of Restriced Imagenet.". 
- format(len(synsets))) + print("Using {} different synsets for construction of Restriced Imagenet.".format(len(synsets))) return synsets def str_to_indices(string): """Expects a string in the format '32-123, 256, 280-321'""" - assert not string.endswith( - ","), "provided string '{}' ends with a comma, pls remove it".format( - string) + assert not string.endswith(","), "provided string '{}' ends with a comma, pls remove it".format(string) subs = string.split(",") indices = [] for sub in subs: @@ -236,8 +223,7 @@ def __init__(self, config=None): self.config = config if not type(self.config) == dict: self.config = {} - self.keep_orig_class_label = self.config.get("keep_orig_class_label", - False) + self.keep_orig_class_label = self.config.get("keep_orig_class_label", False) self.process_images = True # if False we skip loading & processing images and self.data contains filepaths self._prepare() self._prepare_synset_to_human() @@ -255,14 +241,15 @@ def _prepare(self): raise NotImplementedError() def _filter_relpaths(self, relpaths): - ignore = set(["n06596364_9591.JPEG", ]) - relpaths = [ - rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore - ] + ignore = set( + [ + "n06596364_9591.JPEG", + ] + ) + relpaths = [rpath for rpath in relpaths if not rpath.split("/")[-1] in ignore] if "sub_indices" in self.config: indices = str_to_indices(self.config["sub_indices"]) - synsets = give_synsets_from_indices( - indices, path_to_yaml=self.idx2syn) # returns a list of strings + synsets = give_synsets_from_indices(indices, path_to_yaml=self.idx2syn) # returns a list of strings self.synset2idx = synset2idx(path_to_yaml=self.idx2syn) files = [] for rpath in relpaths: @@ -277,8 +264,7 @@ def _prepare_synset_to_human(self): SIZE = 2655750 URL = "https://heibox.uni-heidelberg.de/f/9f28e956cd304264bb82/?dl=1" self.human_dict = os.path.join(self.root, "synset_human.txt") - if (not os.path.exists(self.human_dict) or - not os.path.getsize(self.human_dict) == SIZE): + if not os.path.exists(self.human_dict) or not os.path.getsize(self.human_dict) == SIZE: download(URL, self.human_dict) def _prepare_idx_to_synset(self): @@ -289,8 +275,7 @@ def _prepare_idx_to_synset(self): def _prepare_human_to_integer_label(self): URL = "https://heibox.uni-heidelberg.de/f/2362b797d5be43b883f6/?dl=1" - self.human2integer = os.path.join(self.root, - "imagenet1000_clsidx_to_labels.txt") + self.human2integer = os.path.join(self.root, "imagenet1000_clsidx_to_labels.txt") if not os.path.exists(self.human2integer): download(URL, self.human2integer) with open(self.human2integer, "r") as f: @@ -306,15 +291,13 @@ def _load(self): self.relpaths = f.read().splitlines() l1 = len(self.relpaths) self.relpaths = self._filter_relpaths(self.relpaths) - print("Removed {} files from filelist during filtering.".format( - l1 - len(self.relpaths))) + print("Removed {} files from filelist during filtering.".format(l1 - len(self.relpaths))) self.synsets = [p.split("/")[0] for p in self.relpaths] self.abspaths = [os.path.join(self.datadir, p) for p in self.relpaths] unique_synsets = np.unique(self.synsets) - class_dict = dict((synset, i) - for i, synset in enumerate(unique_synsets)) + class_dict = dict((synset, i) for i, synset in enumerate(unique_synsets)) if not self.keep_orig_class_label: self.class_labels = [class_dict[s] for s in self.synsets] else: @@ -339,7 +322,8 @@ def _load(self): self.abspaths, labels=labels, size=self.size, - random_crop=self.random_crop, ) + random_crop=self.random_crop, + ) else: self.data = self.abspaths @@ 
-348,8 +332,12 @@ class ImageNetTrain(ImageNetBase): NAME = "ILSVRC2012_train" URL = "http://www.image-net.org/challenges/LSVRC/2012/" AT_HASH = "a306397ccf9c2ead27155983c254227c0fd938e2" - FILES = ["ILSVRC2012_img_train.tar", ] - SIZES = [147897477120, ] + FILES = [ + "ILSVRC2012_img_train.tar", + ] + SIZES = [ + 147897477120, + ] def __init__(self, process_images=True, data_root=None, **kwargs): self.process_images = process_images @@ -360,15 +348,13 @@ def _prepare(self): if self.data_root: self.root = os.path.join(self.data_root, self.NAME) else: - cachedir = os.environ.get("XDG_CACHE_HOME", - os.path.expanduser("~/.cache")) + cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) self.root = os.path.join(cachedir, "autoencoders/data", self.NAME) self.datadir = os.path.join(self.root, "data") self.txt_filelist = os.path.join(self.root, "filelist.txt") self.expected_length = 1281167 - self.random_crop = retrieve( - self.config, "ImageNetTrain/random_crop", default=True) + self.random_crop = retrieve(self.config, "ImageNetTrain/random_crop", default=True) if not is_prepared(self.root): # prep print("Preparing dataset {} in {}".format(self.NAME, self.root)) @@ -376,8 +362,7 @@ def _prepare(self): datadir = self.datadir if not os.path.exists(datadir): path = os.path.join(self.root, self.FILES[0]) - if (not os.path.exists(path) or - not os.path.getsize(path) == self.SIZES[0]): + if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]: import academictorrents as at atpath = at.get(self.AT_HASH, datastore=self.root) @@ -391,7 +376,7 @@ def _prepare(self): print("Extracting sub-tars.") subpaths = sorted(glob.glob(os.path.join(datadir, "*.tar"))) for subpath in tqdm(subpaths): - subdir = subpath[:-len(".tar")] + subdir = subpath[: -len(".tar")] os.makedirs(subdir, exist_ok=True) with tarfile.open(subpath, "r:") as tar: tar.extractall(path=subdir) @@ -429,14 +414,12 @@ def _prepare(self): if self.data_root: self.root = os.path.join(self.data_root, self.NAME) else: - cachedir = os.environ.get("XDG_CACHE_HOME", - os.path.expanduser("~/.cache")) + cachedir = os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")) self.root = os.path.join(cachedir, "autoencoders/data", self.NAME) self.datadir = os.path.join(self.root, "data") self.txt_filelist = os.path.join(self.root, "filelist.txt") self.expected_length = 50000 - self.random_crop = retrieve( - self.config, "ImageNetValidation/random_crop", default=False) + self.random_crop = retrieve(self.config, "ImageNetValidation/random_crop", default=False) if not is_prepared(self.root): # prep print("Preparing dataset {} in {}".format(self.NAME, self.root)) @@ -444,8 +427,7 @@ def _prepare(self): datadir = self.datadir if not os.path.exists(datadir): path = os.path.join(self.root, self.FILES[0]) - if (not os.path.exists(path) or - not os.path.getsize(path) == self.SIZES[0]): + if not os.path.exists(path) or not os.path.getsize(path) == self.SIZES[0]: import academictorrents as at atpath = at.get(self.AT_HASH, datastore=self.root) @@ -457,8 +439,7 @@ def _prepare(self): tar.extractall(path=datadir) vspath = os.path.join(self.root, self.FILES[1]) - if (not os.path.exists(vspath) or - not os.path.getsize(vspath) == self.SIZES[1]): + if not os.path.exists(vspath) or not os.path.getsize(vspath) == self.SIZES[1]: download(self.VS_URL, vspath) with open(vspath, "r") as f: @@ -486,14 +467,15 @@ def _prepare(self): class ImageNetSR(Dataset): def __init__( - self, - size=None, - degradation=None, - downscale_f=4, - 
min_crop_f=0.5, - max_crop_f=1.0, - random_crop=True, - output_LR_image=False, ): + self, + size=None, + degradation=None, + downscale_f=4, + min_crop_f=0.5, + max_crop_f=1.0, + random_crop=True, + output_LR_image=False, + ): """ Imagenet Superresolution Dataloader Performs following ops in order: @@ -522,30 +504,22 @@ def __init__( assert max_crop_f <= 1.0 self.center_crop = not random_crop - self.image_rescaler = albumentations.SmallestMaxSize( - max_size=size, interpolation=cv2.INTER_AREA) + self.image_rescaler = albumentations.SmallestMaxSize(max_size=size, interpolation=cv2.INTER_AREA) - self.pil_interpolation = ( - False # gets reset later if incase interp_op is from pillow - ) + self.pil_interpolation = False # gets reset later if incase interp_op is from pillow if degradation == "bsrgan": - self.degradation_process = partial( - degradation_fn_bsr, sf=downscale_f) + self.degradation_process = partial(degradation_fn_bsr, sf=downscale_f) elif degradation == "bsrgan_light": - self.degradation_process = partial( - degradation_fn_bsr_light, sf=downscale_f) + self.degradation_process = partial(degradation_fn_bsr_light, sf=downscale_f) else: self.pil_interpolation = degradation.startswith("pil_") if self.pil_interpolation: interpolation_fn = degradation.replace("pil_", "") - self.degradation_process = partial( - TF.resize, - size=self.LR_size, - interpolation=interpolation_fn) + self.degradation_process = partial(TF.resize, size=self.LR_size, interpolation=interpolation_fn) else: interpolation_fn = { "cv_nearest": cv2.INTER_NEAREST, @@ -555,7 +529,8 @@ def __init__( "cv_lanczos": cv2.INTER_LANCZOS4, }[degradation] self.degradation_process = albumentations.SmallestMaxSize( - max_size=self.LR_size, interpolation=interpolation_fn) + max_size=self.LR_size, interpolation=interpolation_fn + ) def __len__(self): return len(self.base) @@ -570,17 +545,14 @@ def __getitem__(self, i): image = np.array(image).astype(np.uint8) min_side_len = min(image.shape[:2]) - crop_side_len = min_side_len * np.random.uniform( - self.min_crop_f, self.max_crop_f, size=None) + crop_side_len = min_side_len * np.random.uniform(self.min_crop_f, self.max_crop_f, size=None) crop_side_len = int(crop_side_len) if self.center_crop: - self.cropper = albumentations.CenterCrop( - height=crop_side_len, width=crop_side_len) + self.cropper = albumentations.CenterCrop(height=crop_side_len, width=crop_side_len) else: - self.cropper = albumentations.RandomCrop( - height=crop_side_len, width=crop_side_len) + self.cropper = albumentations.RandomCrop(height=crop_side_len, width=crop_side_len) image = self.cropper(image=image)["image"] image = self.image_rescaler(image=image)["image"] @@ -592,11 +564,9 @@ def __getitem__(self, i): LR_image = np.array(LR_image).astype(np.uint8) else: LR_image = self.degradation_process(image=image)["image"] - example["LR_image"] = (( - LR_image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1])) + example["LR_image"] = (LR_image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]) - example["image"] = (image / 127.5 - 1.0).astype(np.float32).transpose( - [2, 0, 1]) + example["image"] = (image / 127.5 - 1.0).astype(np.float32).transpose([2, 0, 1]) return example @@ -608,7 +578,9 @@ def __init__(self, **kwargs): def get_base(self): with open("data/imagenet_train_hr_indices.p", "rb") as f: indices = pickle.load(f) - dset = ImageNetTrain(process_images=False, ) + dset = ImageNetTrain( + process_images=False, + ) return Subset(dset, indices) diff --git 
a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py index 37224cba9a9d9..890a4eea89241 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/__init__.py @@ -13,5 +13,4 @@ # limitations under the License. from .bsrgan import degradation_bsrgan_variant as degradation_fn_bsr -from .bsrgan_light import \ - degradation_bsrgan_variant as degradation_fn_bsr_light +from .bsrgan_light import degradation_bsrgan_variant as degradation_fn_bsr_light diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py index a50493c2591ea..1efdbaa95c8ca 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan.py @@ -51,7 +51,7 @@ def modcrop_np(img, sf): """ w, h = img.shape[:2] im = np.copy(img) - return im[:w - w % sf, :h - h % sf, ...] + return im[: w - w % sf, : h - h % sf, ...] """ @@ -69,7 +69,7 @@ def analytic_kernel(k): # Loop over the small kernel to fill the big one for r in range(k_size): for c in range(k_size): - big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k + big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k # Crop the edges of the big kernel to ignore very small values and increase run time of SR crop = k_size // 2 cropped_big_k = big_k[crop:-crop, crop:-crop] @@ -90,9 +90,9 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): """ v = np.dot( - np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), - np.array([1.0, 0.0]), ) + np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), + np.array([1.0, 0.0]), + ) V = np.array([[v[0], v[1]], [v[1], -v[0]]]) D = np.array([[l1, 0], [0, l2]]) Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) @@ -161,11 +161,12 @@ def blur(x, k): def gen_kernel( - k_size=np.array([15, 15]), - scale_factor=np.array([4, 4]), - min_var=0.6, - max_var=10.0, - noise_level=0, ): + k_size=np.array([15, 15]), + scale_factor=np.array([4, 4]), + min_var=0.6, + max_var=10.0, + noise_level=0, +): """ " # modified version of https://github.com/assafshocher/BlindSR_dataset_generator # Kai Zhang @@ -180,14 +181,12 @@ def gen_kernel( # Set COV matrix using Lambdas and Theta LAMBDA = np.diag([lambda_1, lambda_2]) - Q = np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @LAMBDA @Q.T + Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) + SIGMA = Q @ LAMBDA @ Q.T INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1 - ) # - 0.5 * (scale_factor - k_size % 2) + MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) MU = MU[None, None, :, None] # Create meshgrid for Gaussian @@ -197,7 +196,7 @@ def gen_kernel( # Calcualte Gaussian for every pixel of the kernel ZZ = Z - MU ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @INV_SIGMA @ZZ)) * (1 + noise) + raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) # shift the kernel so it will be centered # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) @@ -212,8 +211,7 @@ def fspecial_gaussian(hsize, sigma): hsize = [hsize, hsize] siz 
= [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] std = sigma - [x, y] = np.meshgrid( - np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) + [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) arg = -(x * x + y * y) / (2 * std * std) h = np.exp(arg) h[h < scipy.finfo(float).eps * h.max()] = 0 @@ -279,9 +277,7 @@ def srmd_degradation(x, k, sf=3): year={2018} } """ - x = ndimage.filters.convolve( - x, np.expand_dims( - k, axis=2), mode="wrap") # 'nearest' | 'mirror' + x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror' x = bicubic_degradation(x, sf=sf) return x @@ -359,13 +355,11 @@ def add_blur(img, sf=4): ksize=2 * random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, - l2=l2, ) + l2=l2, + ) else: - k = fspecial("gaussian", 2 * random.randint(2, 11) + 3, - wd * random.random()) - img = ndimage.filters.convolve( - img, np.expand_dims( - k, axis=2), mode="mirror") + k = fspecial("gaussian", 2 * random.randint(2, 11) + 3, wd * random.random()) + img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror") return img @@ -381,7 +375,8 @@ def add_resize(img, sf=4): img = cv2.resize( img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) return img @@ -391,18 +386,15 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) rnum = np.random.rand() if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: # add noise L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img @@ -412,32 +404,28 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25): img = np.clip(img, 0.0, 1.0) rnum = random.random() if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_Poisson_noise(img): img = np.clip((img * 255.0).round(), 0, 255) / 255.0 - vals = 10**(2 * random.random() + 2.0) # [2, 4] + vals = 10 ** (2 * random.random() + 2.0) # [2, 4] if 
random.random() < 0.5: img = np.random.poisson(img * vals).astype(np.float32) / vals else: img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0 - noise_gray = (np.random.poisson(img_gray * vals).astype(np.float32) / - vals - img_gray) + noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray img += noise_gray[:, :, np.newaxis] img = np.clip(img, 0.0, 1.0) return img @@ -446,8 +434,7 @@ def add_Poisson_noise(img): def add_JPEG_noise(img): quality_factor = random.randint(30, 95) img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode( - ".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) + result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) img = cv2.imdecode(encimg, 1) img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) return img @@ -457,11 +444,10 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64): h, w = lq.shape[:2] rnd_h = random.randint(0, h - lq_patchsize) rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] + lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :] rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize - * sf, :] + hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :] return lq, hq @@ -482,7 +468,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): sf_ori = sf h1, w1 = img.shape[:2] - img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop h, w = img.shape[:2] if h < lq_patchsize * sf or w < lq_patchsize * sf: @@ -495,7 +481,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: img = util.imresize_np(img, 1 / 2, True) img = np.clip(img, 0.0, 1.0) @@ -506,7 +493,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -524,15 +512,13 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - img = ndimage.filters.convolve( - img, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror") img = img[0::sf, 0::sf, ...] 
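            # Condensed, the shuffled pipeline that degradation_bsrgan implements
            # amounts to the sketch below (helper names are the ones defined in this
            # file; the real function randomizes the stage order and keeps the final
            # downsample and JPEG steps last):
            #
            #     def degrade_sketch(img, sf=4):
            #         img = modcrop_np(img, sf)             # make dims divisible by sf
            #         hq = img.copy()
            #         img = add_blur(img, sf=sf)            # Gaussian / anisotropic blur
            #         img = img[0::sf, 0::sf, ...]          # nearest downsample to 1/sf
            #         img = add_Gaussian_noise(img, 2, 25)  # color / gray / correlated noise
            #         img = add_JPEG_noise(img)             # JPEG compression artifacts
            #         return random_crop(img, hq, sf=sf)    # aligned LQ / HQ patches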
# nearest downsampling img = np.clip(img, 0.0, 1.0) @@ -541,7 +527,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) elif i == 4: @@ -585,7 +572,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): _, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 h1, w1 = image.shape[:2] - image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop h, w = image.shape[:2] if sf == 4 and random.random() < scale2_prob: # downsample1 @@ -593,7 +580,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: image = util.imresize_np(image, 1 / 2, True) image = np.clip(image, 0.0, 1.0) @@ -604,7 +592,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -621,17 +610,14 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): sf1 = random.uniform(1, 2 * sf) image = cv2.resize( image, - (int(1 / sf1 * image.shape[1]), - int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - image = ndimage.filters.convolve( - image, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror") image = image[0::sf, 0::sf, ...] 
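            # This variant (no LQ/HQ patch crop, returns a dict keyed by "image") is the
            # one the VAE example consumes: image_degradation/__init__.py re-exports it
            # as degradation_fn_bsr (and the bsrgan_light version as
            # degradation_fn_bsr_light), and ImageNetSR in autoencoder_datasets.py wraps
            # it via partial(degradation_fn_bsr, sf=downscale_f), storing the output
            # under example["LR_image"].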
# nearest downsampling image = np.clip(image, 0.0, 1.0) @@ -640,7 +626,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) image = np.clip(image, 0.0, 1.0) elif i == 4: @@ -673,19 +660,21 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): img_lq = deg_fn(img)["image"] img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize( - max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"] + img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[ + "image" + ] print(img_lq.shape) print("bicubic", img_lq_bicubic.shape) print(img_hq.shape) lq_nearest = cv2.resize( util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) + interpolation=0, + ) lq_bicubic_nearest = cv2.resize( util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) - img_concat = np.concatenate( - [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) + interpolation=0, + ) + img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) util.imsave(img_concat, str(i) + ".png") diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py index 86127e21d672e..94a515d93d914 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/bsrgan_light.py @@ -29,6 +29,7 @@ from scipy.linalg import orth from . import utils_image as util + """ # -------------------------------------------- # Super-Resolution @@ -51,7 +52,7 @@ def modcrop_np(img, sf): """ w, h = img.shape[:2] im = np.copy(img) - return im[:w - w % sf, :h - h % sf, ...] + return im[: w - w % sf, : h - h % sf, ...] 
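    # For reference, modcrop_np just trims each spatial dimension down to a
    # multiple of the scale factor; with an assumed example shape:
    #
    #     x = np.zeros((511, 767, 3), dtype=np.float32)
    #     modcrop_np(x, sf=4).shape   # -> (508, 764, 3)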
""" @@ -69,7 +70,7 @@ def analytic_kernel(k): # Loop over the small kernel to fill the big one for r in range(k_size): for c in range(k_size): - big_k[2 * r:2 * r + k_size, 2 * c:2 * c + k_size] += k[r, c] * k + big_k[2 * r : 2 * r + k_size, 2 * c : 2 * c + k_size] += k[r, c] * k # Crop the edges of the big kernel to ignore very small values and increase run time of SR crop = k_size // 2 cropped_big_k = big_k[crop:-crop, crop:-crop] @@ -90,9 +91,9 @@ def anisotropic_Gaussian(ksize=15, theta=np.pi, l1=6, l2=6): """ v = np.dot( - np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), - np.array([1.0, 0.0]), ) + np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]), + np.array([1.0, 0.0]), + ) V = np.array([[v[0], v[1]], [v[1], -v[0]]]) D = np.array([[l1, 0], [0, l2]]) Sigma = np.dot(np.dot(V, D), np.linalg.inv(V)) @@ -161,11 +162,12 @@ def blur(x, k): def gen_kernel( - k_size=np.array([15, 15]), - scale_factor=np.array([4, 4]), - min_var=0.6, - max_var=10.0, - noise_level=0, ): + k_size=np.array([15, 15]), + scale_factor=np.array([4, 4]), + min_var=0.6, + max_var=10.0, + noise_level=0, +): """ " # modified version of https://github.com/assafshocher/BlindSR_dataset_generator # Kai Zhang @@ -180,14 +182,12 @@ def gen_kernel( # Set COV matrix using Lambdas and Theta LAMBDA = np.diag([lambda_1, lambda_2]) - Q = np.array( - [[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) - SIGMA = Q @LAMBDA @Q.T + Q = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]]) + SIGMA = Q @ LAMBDA @ Q.T INV_SIGMA = np.linalg.inv(SIGMA)[None, None, :, :] # Set expectation position (shifting kernel for aligned image) - MU = k_size // 2 - 0.5 * (scale_factor - 1 - ) # - 0.5 * (scale_factor - k_size % 2) + MU = k_size // 2 - 0.5 * (scale_factor - 1) # - 0.5 * (scale_factor - k_size % 2) MU = MU[None, None, :, None] # Create meshgrid for Gaussian @@ -197,7 +197,7 @@ def gen_kernel( # Calcualte Gaussian for every pixel of the kernel ZZ = Z - MU ZZ_t = ZZ.transpose(0, 1, 3, 2) - raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @INV_SIGMA @ZZ)) * (1 + noise) + raw_kernel = np.exp(-0.5 * np.squeeze(ZZ_t @ INV_SIGMA @ ZZ)) * (1 + noise) # shift the kernel so it will be centered # raw_kernel_centered = kernel_shift(raw_kernel, scale_factor) @@ -212,8 +212,7 @@ def fspecial_gaussian(hsize, sigma): hsize = [hsize, hsize] siz = [(hsize[0] - 1.0) / 2.0, (hsize[1] - 1.0) / 2.0] std = sigma - [x, y] = np.meshgrid( - np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) + [x, y] = np.meshgrid(np.arange(-siz[1], siz[1] + 1), np.arange(-siz[0], siz[0] + 1)) arg = -(x * x + y * y) / (2 * std * std) h = np.exp(arg) h[h < scipy.finfo(float).eps * h.max()] = 0 @@ -279,9 +278,7 @@ def srmd_degradation(x, k, sf=3): year={2018} } """ - x = ndimage.filters.convolve( - x, np.expand_dims( - k, axis=2), mode="wrap") # 'nearest' | 'mirror' + x = ndimage.filters.convolve(x, np.expand_dims(k, axis=2), mode="wrap") # 'nearest' | 'mirror' x = bicubic_degradation(x, sf=sf) return x @@ -359,16 +356,10 @@ def add_blur(img, sf=4): if random.random() < 0.5: l1 = wd2 * random.random() l2 = wd2 * random.random() - k = anisotropic_Gaussian( - ksize=random.randint(2, 11) + 3, - theta=random.random() * np.pi, - l1=l1, - l2=l2) + k = anisotropic_Gaussian(ksize=random.randint(2, 11) + 3, theta=random.random() * np.pi, l1=l1, l2=l2) else: k = fspecial("gaussian", random.randint(2, 4) + 3, wd * random.random()) - img = ndimage.filters.convolve( - img, np.expand_dims( - k, 
axis=2), mode="mirror") + img = ndimage.filters.convolve(img, np.expand_dims(k, axis=2), mode="mirror") return img @@ -384,7 +375,8 @@ def add_resize(img, sf=4): img = cv2.resize( img, (int(sf1 * img.shape[1]), int(sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) return img @@ -394,18 +386,15 @@ def add_Gaussian_noise(img, noise_level1=2, noise_level2=25): noise_level = random.randint(noise_level1, noise_level2) rnum = np.random.rand() if rnum > 0.6: # add color Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: # add grayscale Gaussian noise - img = img + np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img = img + np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: # add noise L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img = img + np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img = img + np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img @@ -415,32 +404,28 @@ def add_speckle_noise(img, noise_level1=2, noise_level2=25): img = np.clip(img, 0.0, 1.0) rnum = random.random() if rnum > 0.6: - img += img * np.random.normal(0, noise_level / 255.0, - img.shape).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, img.shape).astype(np.float32) elif rnum < 0.4: - img += img * np.random.normal(0, noise_level / 255.0, - (*img.shape[:2], 1)).astype(np.float32) + img += img * np.random.normal(0, noise_level / 255.0, (*img.shape[:2], 1)).astype(np.float32) else: L = noise_level2 / 255.0 D = np.diag(np.random.rand(3)) U = orth(np.random.rand(3, 3)) conv = np.dot(np.dot(np.transpose(U), D), U) - img += img * np.random.multivariate_normal( - [0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) + img += img * np.random.multivariate_normal([0, 0, 0], np.abs(L**2 * conv), img.shape[:2]).astype(np.float32) img = np.clip(img, 0.0, 1.0) return img def add_Poisson_noise(img): img = np.clip((img * 255.0).round(), 0, 255) / 255.0 - vals = 10**(2 * random.random() + 2.0) # [2, 4] + vals = 10 ** (2 * random.random() + 2.0) # [2, 4] if random.random() < 0.5: img = np.random.poisson(img * vals).astype(np.float32) / vals else: img_gray = np.dot(img[..., :3], [0.299, 0.587, 0.114]) img_gray = np.clip((img_gray * 255.0).round(), 0, 255) / 255.0 - noise_gray = (np.random.poisson(img_gray * vals).astype(np.float32) / - vals - img_gray) + noise_gray = np.random.poisson(img_gray * vals).astype(np.float32) / vals - img_gray img += noise_gray[:, :, np.newaxis] img = np.clip(img, 0.0, 1.0) return img @@ -449,8 +434,7 @@ def add_Poisson_noise(img): def add_JPEG_noise(img): quality_factor = random.randint(80, 95) img = cv2.cvtColor(util.single2uint(img), cv2.COLOR_RGB2BGR) - result, encimg = cv2.imencode( - ".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) + result, encimg = cv2.imencode(".jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), quality_factor]) img = cv2.imdecode(encimg, 1) img = cv2.cvtColor(util.uint2single(img), cv2.COLOR_BGR2RGB) return img @@ -460,11 +444,10 @@ def random_crop(lq, hq, sf=4, lq_patchsize=64): h, w = lq.shape[:2] rnd_h = 
random.randint(0, h - lq_patchsize) rnd_w = random.randint(0, w - lq_patchsize) - lq = lq[rnd_h:rnd_h + lq_patchsize, rnd_w:rnd_w + lq_patchsize, :] + lq = lq[rnd_h : rnd_h + lq_patchsize, rnd_w : rnd_w + lq_patchsize, :] rnd_h_H, rnd_w_H = int(rnd_h * sf), int(rnd_w * sf) - hq = hq[rnd_h_H:rnd_h_H + lq_patchsize * sf, rnd_w_H:rnd_w_H + lq_patchsize - * sf, :] + hq = hq[rnd_h_H : rnd_h_H + lq_patchsize * sf, rnd_w_H : rnd_w_H + lq_patchsize * sf, :] return lq, hq @@ -485,7 +468,7 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): sf_ori = sf h1, w1 = img.shape[:2] - img = img.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + img = img.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] # mod crop h, w = img.shape[:2] if h < lq_patchsize * sf or w < lq_patchsize * sf: @@ -498,7 +481,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / 2 * img.shape[1]), int(1 / 2 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: img = util.imresize_np(img, 1 / 2, True) img = np.clip(img, 0.0, 1.0) @@ -509,7 +493,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -527,15 +512,13 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf1 * img.shape[1]), int(1 / sf1 * img.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - img = ndimage.filters.convolve( - img, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + img = ndimage.filters.convolve(img, np.expand_dims(k_shifted, axis=2), mode="mirror") img = img[0::sf, 0::sf, ...] # nearest downsampling img = np.clip(img, 0.0, 1.0) @@ -544,7 +527,8 @@ def degradation_bsrgan(img, sf=4, lq_patchsize=72, isp_model=None): img = cv2.resize( img, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) img = np.clip(img, 0.0, 1.0) elif i == 4: @@ -588,7 +572,7 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): _, jpeg_prob, scale2_prob = 0.25, 0.9, 0.25 h1, w1 = image.shape[:2] - image = image.copy()[:w1 - w1 % sf, :h1 - h1 % sf, ...] # mod crop + image = image.copy()[: w1 - w1 % sf, : h1 - h1 % sf, ...] 
# mod crop h, w = image.shape[:2] if sf == 4 and random.random() < scale2_prob: # downsample1 @@ -596,7 +580,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / 2 * image.shape[1]), int(1 / 2 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) else: image = util.imresize_np(image, 1 / 2, True) image = np.clip(image, 0.0, 1.0) @@ -607,7 +592,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): if idx1 > idx2: # keep downsample3 last shuffle_order[idx1], shuffle_order[idx2] = ( shuffle_order[idx2], - shuffle_order[idx1], ) + shuffle_order[idx1], + ) for i in shuffle_order: @@ -624,17 +610,14 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): sf1 = random.uniform(1, 2 * sf) image = cv2.resize( image, - (int(1 / sf1 * image.shape[1]), - int(1 / sf1 * image.shape[0])), - interpolation=random.choice([1, 2, 3]), ) + (int(1 / sf1 * image.shape[1]), int(1 / sf1 * image.shape[0])), + interpolation=random.choice([1, 2, 3]), + ) else: k = fspecial("gaussian", 25, random.uniform(0.1, 0.6 * sf)) k_shifted = shift_pixel(k, sf) - k_shifted = k_shifted / k_shifted.sum( - ) # blur with shifted kernel - image = ndimage.filters.convolve( - image, np.expand_dims( - k_shifted, axis=2), mode="mirror") + k_shifted = k_shifted / k_shifted.sum() # blur with shifted kernel + image = ndimage.filters.convolve(image, np.expand_dims(k_shifted, axis=2), mode="mirror") image = image[0::sf, 0::sf, ...] # nearest downsampling image = np.clip(image, 0.0, 1.0) @@ -644,7 +627,8 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): image = cv2.resize( image, (int(1 / sf * a), int(1 / sf * b)), - interpolation=random.choice([1, 2, 3]), ) + interpolation=random.choice([1, 2, 3]), + ) image = np.clip(image, 0.0, 1.0) elif i == 4: @@ -677,19 +661,21 @@ def degradation_bsrgan_variant(image, sf=4, isp_model=None): img_lq = deg_fn(img)["image"] img_hq, img_lq = util.uint2single(img_hq), util.uint2single(img_lq) print(img_lq) - img_lq_bicubic = albumentations.SmallestMaxSize( - max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)["image"] + img_lq_bicubic = albumentations.SmallestMaxSize(max_size=h, interpolation=cv2.INTER_CUBIC)(image=img_hq)[ + "image" + ] print(img_lq.shape) print("bicubic", img_lq_bicubic.shape) print(img_hq.shape) lq_nearest = cv2.resize( util.single2uint(img_lq), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) + interpolation=0, + ) lq_bicubic_nearest = cv2.resize( util.single2uint(img_lq_bicubic), (int(sf * img_lq.shape[1]), int(sf * img_lq.shape[0])), - interpolation=0, ) - img_concat = np.concatenate( - [lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) + interpolation=0, + ) + img_concat = np.concatenate([lq_bicubic_nearest, lq_nearest, util.single2uint(img_hq)], axis=1) util.imsave(img_concat, str(i) + ".png") diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py index 1e21fe66a10b6..be3bdaa3321cc 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/image_degradation/utils_image.py @@ -71,14 +71,12 @@ def cubic(x): absx = paddle.abs(x) absx2 = absx**2 absx3 = absx**3 - return (1.5 * absx3 - 2.5 * absx2 + 1) * ( - (absx <= 1).astype(absx.dtype)) + ( - -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2) * (( - (absx > 1) * (absx <= 
2)).astype(absx.dtype)) + return (1.5 * absx3 - 2.5 * absx2 + 1) * ((absx <= 1).astype(absx.dtype)) + ( + -0.5 * absx3 + 2.5 * absx2 - 4 * absx + 2 + ) * (((absx > 1) * (absx <= 2)).astype(absx.dtype)) -def calculate_weights_indices(in_length, out_length, scale, kernel, - kernel_width, antialiasing): +def calculate_weights_indices(in_length, out_length, scale, kernel, kernel_width, antialiasing): if (scale < 1) and (antialiasing): # Use a modified kernel to simultaneously interpolate and antialias- larger kernel width kernel_width = kernel_width / scale @@ -102,14 +100,13 @@ def calculate_weights_indices(in_length, out_length, scale, kernel, # The indices of the input pixels involved in computing the k-th output # pixel are in row k of the indices matrix. - indices = left.reshape([out_length, 1]).expand( - [out_length, P]) + paddle.linspace(0, P - 1, P).reshape([1, P]).expand( - [out_length, P]) + indices = left.reshape([out_length, 1]).expand([out_length, P]) + paddle.linspace(0, P - 1, P).reshape( + [1, P] + ).expand([out_length, P]) # The weights used to compute the k-th output pixel are in row k of the # weights matrix. - distance_to_center = u.reshape([out_length, 1]).expand( - [out_length, P]) - indices + distance_to_center = u.reshape([out_length, 1]).expand([out_length, P]) - indices # apply cubic kernel if (scale < 1) and (antialiasing): weights = scale * cubic(distance_to_center * scale) @@ -158,13 +155,15 @@ def imresize_np(img, scale, antialiasing=True): # get weights and indices weights_H, indices_H, sym_len_Hs, sym_len_He = calculate_weights_indices( - in_H, out_H, scale, kernel, kernel_width, antialiasing) + in_H, out_H, scale, kernel, kernel_width, antialiasing + ) weights_W, indices_W, sym_len_Ws, sym_len_We = calculate_weights_indices( - in_W, out_W, scale, kernel, kernel_width, antialiasing) + in_W, out_W, scale, kernel, kernel_width, antialiasing + ) # process H dimension # symmetric copying img_aug = paddle.zeros([in_H + sym_len_Hs + sym_len_He, in_W, in_C]) - img_aug[sym_len_Hs:sym_len_Hs + in_H] = img + img_aug[sym_len_Hs : sym_len_Hs + in_H] = img sym_patch = img[:sym_len_Hs, :, :] inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64") @@ -174,20 +173,19 @@ def imresize_np(img, scale, antialiasing=True): sym_patch = img[-sym_len_He:, :, :] inv_idx = paddle.arange(sym_patch.shape[0] - 1, -1, -1).astype("int64") sym_patch_inv = sym_patch.index_select(inv_idx, axis=0) - img_aug[sym_len_Hs + in_H:sym_len_Hs + in_H + sym_len_He] = sym_patch_inv + img_aug[sym_len_Hs + in_H : sym_len_Hs + in_H + sym_len_He] = sym_patch_inv out_1 = paddle.zeros([out_H, in_W, in_C]) kernel_width = weights_H.shape[1] for i in range(out_H): idx = int(indices_H[i][0]) for j in range(out_C): - out_1[i, :, j] = (img_aug[idx:idx + kernel_width, :, j] - .transpose([1, 0]).mv(weights_H[i])) + out_1[i, :, j] = img_aug[idx : idx + kernel_width, :, j].transpose([1, 0]).mv(weights_H[i]) # process W dimension # symmetric copying out_1_aug = paddle.zeros([out_H, in_W + sym_len_Ws + sym_len_We, in_C]) - out_1_aug[:, sym_len_Ws:sym_len_Ws + in_W] = out_1 + out_1_aug[:, sym_len_Ws : sym_len_Ws + in_W] = out_1 sym_patch = out_1[:, :sym_len_Ws, :] inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64") @@ -197,16 +195,14 @@ def imresize_np(img, scale, antialiasing=True): sym_patch = out_1[:, -sym_len_We:, :] inv_idx = paddle.arange(sym_patch.shape[1] - 1, -1, -1).astype("int64") sym_patch_inv = sym_patch.index_select(inv_idx, axis=1) - out_1_aug[:, sym_len_Ws + in_W:sym_len_Ws 
+ in_W + - sym_len_We] = sym_patch_inv + out_1_aug[:, sym_len_Ws + in_W : sym_len_Ws + in_W + sym_len_We] = sym_patch_inv out_2 = paddle.zeros([out_H, out_W, in_C]) kernel_width = weights_W.shape[1] for i in range(out_W): idx = int(indices_W[i][0]) for j in range(out_C): - out_2[:, i, j] = out_1_aug[:, idx:idx + kernel_width, j].mv( - weights_W[i]) + out_2[:, i, j] = out_1_aug[:, idx : idx + kernel_width, j].mv(weights_W[i]) if need_squeeze: out_2 = out_2.squeeze() diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py b/ppdiffusers/examples/autoencoder/vae/ldm/losses.py index 3d8311776fdb3..a1d4f642125ae 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/losses.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/losses.py @@ -22,47 +22,36 @@ import paddle.nn.functional as F from paddle.utils.download import get_weights_path_from_url -from ppdiffusers.initializer import (constant_, normal_, - reset_initialized_parameter) +from ppdiffusers.initializer import constant_, normal_, reset_initialized_parameter model_urls = { "vgg16": ( "https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/lpips_vgg16.pdparams", - "a1583475db9e49334735f2866847ae41", ), + "a1583475db9e49334735f2866847ae41", + ), "vgg_netlin": ( "https://paddlenlp.bj.bcebos.com/models/lpips_vgg16/vgg_netlin.pdparams", - "f3ae85f16a1a243e789606ae0c4a59a1", ), + "f3ae85f16a1a243e789606ae0c4a59a1", + ), } class ActNorm(nn.Layer): - def __init__(self, - num_features, - logdet=False, - affine=True, - allow_reverse_init=False): + def __init__(self, num_features, logdet=False, affine=True, allow_reverse_init=False): assert affine super().__init__() self.logdet = logdet - self.loc = self.create_parameter( - (1, num_features, 1, 1), - default_initializer=nn.initializer.Constant(0)) - self.scale = self.create_parameter( - (1, num_features, 1, 1), - default_initializer=nn.initializer.Constant(1)) + self.loc = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(0)) + self.scale = self.create_parameter((1, num_features, 1, 1), default_initializer=nn.initializer.Constant(1)) self.allow_reverse_init = allow_reverse_init - self.register_buffer( - "initialized", paddle.to_tensor( - 0, dtype=paddle.int64)) + self.register_buffer("initialized", paddle.to_tensor(0, dtype=paddle.int64)) @paddle.no_grad() def initialize(self, input): flatten = input.transpose([1, 0, 2, 3]).reshape([input.shape[1], -1]) - mean = (flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3) - .transpose([1, 0, 2, 3])) - std = (flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3) - .transpose([1, 0, 2, 3])) + mean = flatten.mean(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3]) + std = flatten.std(1).unsqueeze(1).unsqueeze(2).unsqueeze(3).transpose([1, 0, 2, 3]) self.loc.set_value(-mean) self.scale.set_value(1 / (std + 1e-6)) @@ -80,9 +69,7 @@ def forward(self, input, reverse=False): if self.training and self.initialized.item() == 0: self.initialize(input) - self.initialized.set_value( - paddle.to_tensor( - 1, dtype=self.initialized.dtype)) + self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype)) h = self.scale * (input + self.loc) @@ -106,9 +93,7 @@ def reverse(self, output): ) else: self.initialize(output) - self.initialized.set_value( - paddle.to_tensor( - 1, dtype=self.initialized.dtype)) + self.initialized.set_value(paddle.to_tensor(1, dtype=self.initialized.dtype)) if len(output.shape) == 2: output = output[:, :, None, None] @@ -137,8 +122,7 @@ def hinge_d_loss(logits_real, logits_fake): 
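# For context, the two discriminator losses here follow the standard GAN
# formulations (hinge_d_loss's body is not part of this hunk, so the hinge line
# below is a sketch of the usual form rather than a quote of the source):
#
#     hinge:   d_loss = 0.5 * (mean(relu(1 - logits_real)) + mean(relu(1 + logits_fake)))
#     vanilla: d_loss = 0.5 * (mean(softplus(-logits_real)) + mean(softplus(logits_fake)))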
def vanilla_d_loss(logits_real, logits_fake): - d_loss = 0.5 * (paddle.mean(F.softplus(-logits_real)) + - paddle.mean(F.softplus(logits_fake))) + d_loss = 0.5 * (paddle.mean(F.softplus(-logits_real)) + paddle.mean(F.softplus(logits_fake))) return d_loss @@ -170,8 +154,7 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): norm_layer = nn.BatchNorm2D else: norm_layer = ActNorm - if (type(norm_layer) == functools. - partial): # no need to use bias as BatchNorm2d has affine parameters + if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters use_bias = norm_layer.func != nn.BatchNorm2D else: use_bias = norm_layer != nn.BatchNorm2D @@ -179,8 +162,7 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): kw = 4 padw = 1 sequence = [ - nn.Conv2D( - input_nc, ndf, kernel_size=kw, stride=2, padding=padw), + nn.Conv2D(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2), ] nf_mult = 1 @@ -195,7 +177,8 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): kernel_size=kw, stride=2, padding=padw, - bias_attr=use_bias, ), + bias_attr=use_bias, + ), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2), ] @@ -209,14 +192,14 @@ def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False): kernel_size=kw, stride=1, padding=padw, - bias_attr=use_bias, ), + bias_attr=use_bias, + ), norm_layer(ndf * nf_mult), nn.LeakyReLU(0.2), ] sequence += [ - nn.Conv2D( - ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) + nn.Conv2D(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw) ] # output 1 channel prediction map self.main = nn.Sequential(*sequence) @@ -229,10 +212,8 @@ def spatial_average(in_tens, keepdim=True): return in_tens.mean([2, 3], keepdim=keepdim) -def upsample(in_tens, - out_HW=(64, 64)): # assumes scale factor is same for H and W - return nn.Upsample( - size=out_HW, mode="bilinear", align_corners=False)(in_tens) +def upsample(in_tens, out_HW=(64, 64)): # assumes scale factor is same for H and W + return nn.Upsample(size=out_HW, mode="bilinear", align_corners=False)(in_tens) def normalize_tensor(in_feat, eps=1e-10): @@ -246,10 +227,15 @@ class NetLinLayer(nn.Layer): def __init__(self, chn_in, chn_out=1, use_dropout=False): super(NetLinLayer, self).__init__() - layers = ([nn.Dropout(), ] if (use_dropout) else []) + layers = ( + [ + nn.Dropout(), + ] + if (use_dropout) + else [] + ) layers += [ - nn.Conv2D( - chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False), + nn.Conv2D(chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False), ] self.model = nn.Sequential(*layers) @@ -262,14 +248,12 @@ def __init__(self): super(ScalingLayer, self).__init__() self.register_buffer( "shift", - paddle.to_tensor( - np.asarray([-0.030, -0.088, -0.188]).astype("float32")[ - None, :, None, None]), ) + paddle.to_tensor(np.asarray([-0.030, -0.088, -0.188]).astype("float32")[None, :, None, None]), + ) self.register_buffer( "scale", - paddle.to_tensor( - np.asarray([0.458, 0.448, 0.450]).astype("float32")[ - None, :, None, None]), ) + paddle.to_tensor(np.asarray([0.458, 0.448, 0.450]).astype("float32")[None, :, None, None]), + ) def forward(self, inp): return (inp - self.shift) / self.scale @@ -280,8 +264,7 @@ def __init__(self, pretrained=True, requires_grad=False): super(VGG16, self).__init__() vgg_model = paddle.vision.models.vgg16(pretrained=False) if pretrained: - state_dict = paddle.load( - get_weights_path_from_url(*model_urls["vgg16"])) + state_dict = 
paddle.load(get_weights_path_from_url(*model_urls["vgg16"])) vgg_model.set_state_dict(state_dict) vgg_pretrained_features = vgg_model.features self.slice1 = nn.Sequential() @@ -315,9 +298,7 @@ def forward(self, X): h_relu4_3 = h h = self.slice5(h) h_relu5_3 = h - vgg_outputs = namedtuple( - "VggOutputs", - ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) + vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3", "relu5_3"]) out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3) return out @@ -325,25 +306,27 @@ def forward(self, X): class LPIPS(nn.Layer): def __init__( - self, - pretrained=True, - net="alex", - lpips=True, - spatial=False, - pnet_rand=False, - pnet_tune=False, - use_dropout=True, - model_path=None, - eval_mode=True, - verbose=True, ): + self, + pretrained=True, + net="alex", + lpips=True, + spatial=False, + pnet_rand=False, + pnet_tune=False, + use_dropout=True, + model_path=None, + eval_mode=True, + verbose=True, + ): # lpips - [True] means with linear calibration on top of base network # pretrained - [True] means load linear weights super(LPIPS, self).__init__() if verbose: - print("Setting up [%s] perceptual loss: trunk [%s], spatial [%s]" % - ("LPIPS" if lpips else "baseline", net, "on" - if spatial else "off")) + print( + "Setting up [%s] perceptual loss: trunk [%s], spatial [%s]" + % ("LPIPS" if lpips else "baseline", net, "on" if spatial else "off") + ) self.pnet_type = net.lower() self.pnet_tune = pnet_tune @@ -359,8 +342,7 @@ def __init__( raise NotImplementedError self.L = len(self.chns) - self.net = net_type( - pretrained=not self.pnet_rand, requires_grad=self.pnet_tune) + self.net = net_type(pretrained=not self.pnet_rand, requires_grad=self.pnet_tune) if lpips: lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout) @@ -377,8 +359,7 @@ def __init__( if pretrained: if model_path is None: - model_path = get_weights_path_from_url(*model_urls[ - "vgg_netlin"]) + model_path = get_weights_path_from_url(*model_urls["vgg_netlin"]) if verbose: print("Loading model from: %s" % model_path) import warnings @@ -393,47 +374,29 @@ def __init__( param.stop_gradient = True def forward(self, in0, in1, retPerLayer=False, normalize=False): - if (normalize): # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1] + if normalize: # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1] in0 = 2 * in0 - 1 in1 = 2 * in1 - 1 # v0.0 - original release had a bug, where input was not scaled - in0_input, in1_input = (self.scaling_layer(in0), - self.scaling_layer(in1)) + in0_input, in1_input = (self.scaling_layer(in0), self.scaling_layer(in1)) outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input) feats0, feats1, diffs = {}, {}, {} for kk in range(self.L): - feats0[kk], feats1[kk] = normalize_tensor(outs0[ - kk]), normalize_tensor(outs1[kk]) - diffs[kk] = (feats0[kk] - feats1[kk])**2 + feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk]) + diffs[kk] = (feats0[kk] - feats1[kk]) ** 2 if self.lpips: if self.spatial: - res = [ - upsample( - self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) - for kk in range(self.L) - ] + res = [upsample(self.lins[kk](diffs[kk]), out_HW=in0.shape[2:]) for kk in range(self.L)] else: - res = [ - spatial_average( - self.lins[kk](diffs[kk]), keepdim=True) - for kk in range(self.L) - ] + res = [spatial_average(self.lins[kk](diffs[kk]), keepdim=True) for kk in range(self.L)] else: if self.spatial: - res = [ - upsample( - diffs[kk].sum(axis=1, 
keepdim=True), - out_HW=in0.shape[2:]) for kk in range(self.L) - ] + res = [upsample(diffs[kk].sum(axis=1, keepdim=True), out_HW=in0.shape[2:]) for kk in range(self.L)] else: - res = [ - spatial_average( - diffs[kk].sum(axis=1, keepdim=True), keepdim=True) - for kk in range(self.L) - ] + res = [spatial_average(diffs[kk].sum(axis=1, keepdim=True), keepdim=True) for kk in range(self.L)] val = res[0] for l in range(1, self.L): @@ -447,19 +410,20 @@ def forward(self, in0, in1, retPerLayer=False, normalize=False): class LPIPSWithDiscriminator(nn.Layer): def __init__( - self, - disc_start, - logvar_init=0.0, - kl_weight=1.0, - pixelloss_weight=1.0, - disc_num_layers=3, - disc_in_channels=3, - disc_factor=1.0, - disc_weight=1.0, - perceptual_weight=1.0, - use_actnorm=False, - disc_conditional=False, - disc_loss="hinge", ): + self, + disc_start, + logvar_init=0.0, + kl_weight=1.0, + pixelloss_weight=1.0, + disc_num_layers=3, + disc_in_channels=3, + disc_factor=1.0, + disc_weight=1.0, + perceptual_weight=1.0, + use_actnorm=False, + disc_conditional=False, + disc_loss="hinge", + ): super().__init__() assert disc_loss in ["hinge", "vanilla"] @@ -471,15 +435,13 @@ def __init__( self.perceptual_weight = perceptual_weight self.discriminator = NLayerDiscriminator( - input_nc=disc_in_channels, - n_layers=disc_num_layers, - use_actnorm=use_actnorm) + input_nc=disc_in_channels, n_layers=disc_num_layers, use_actnorm=use_actnorm + ) reset_initialized_parameter(self.discriminator) self.discriminator.apply(weights_init) # output log variance - self.logvar = self.create_parameter( - (1, ), default_initializer=nn.initializer.Constant(logvar_init)) + self.logvar = self.create_parameter((1,), default_initializer=nn.initializer.Constant(logvar_init)) self.discriminator_iter_start = disc_start self.disc_loss = hinge_d_loss if disc_loss == "hinge" else vanilla_d_loss @@ -489,15 +451,11 @@ def __init__( def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): if last_layer is not None: - nll_grads = paddle.autograd.grad( - nll_loss, last_layer, retain_graph=True)[0] - g_grads = paddle.autograd.grad( - g_loss, last_layer, retain_graph=True)[0] + nll_grads = paddle.autograd.grad(nll_loss, last_layer, retain_graph=True)[0] + g_grads = paddle.autograd.grad(g_loss, last_layer, retain_graph=True)[0] else: - nll_grads = paddle.autograd.grad( - nll_loss, self.last_layer[0], retain_graph=True)[0] - g_grads = paddle.autograd.grad( - g_loss, self.last_layer[0], retain_graph=True)[0] + nll_grads = paddle.autograd.grad(nll_loss, self.last_layer[0], retain_graph=True)[0] + g_grads = paddle.autograd.grad(g_loss, self.last_layer[0], retain_graph=True)[0] d_weight = paddle.norm(nll_grads) / (paddle.norm(g_grads) + 1e-4) d_weight = paddle.clip(d_weight, 0.0, 1e4).detach() @@ -505,16 +463,17 @@ def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer=None): return d_weight def forward( - self, - inputs, - reconstructions, - posteriors, - optimizer_idx, - global_step, - last_layer=None, - cond=None, - split="train", - weights=None, ): + self, + inputs, + reconstructions, + posteriors, + optimizer_idx, + global_step, + last_layer=None, + cond=None, + split="train", + weights=None, + ): rec_loss = paddle.abs(inputs - reconstructions) if self.perceptual_weight > 0: p_loss = self.perceptual_loss(inputs, reconstructions) @@ -525,8 +484,7 @@ def forward( weighted_nll_loss = nll_loss if weights is not None: weighted_nll_loss = weights * nll_loss - weighted_nll_loss = paddle.sum( - weighted_nll_loss) / 
weighted_nll_loss.shape[0] + weighted_nll_loss = paddle.sum(weighted_nll_loss) / weighted_nll_loss.shape[0] nll_loss = paddle.sum(nll_loss) / nll_loss.shape[0] kl_loss = posteriors.kl() kl_loss = paddle.sum(kl_loss) / kl_loss.shape[0] @@ -539,37 +497,28 @@ def forward( logits_fake = self.discriminator(reconstructions) else: assert self.disc_conditional - logits_fake = self.discriminator( - paddle.concat( - (reconstructions, cond), axis=1)) + logits_fake = self.discriminator(paddle.concat((reconstructions, cond), axis=1)) g_loss = -paddle.mean(logits_fake) if self.disc_factor > 0.0: try: - d_weight = self.calculate_adaptive_weight( - nll_loss, g_loss, last_layer=last_layer) + d_weight = self.calculate_adaptive_weight(nll_loss, g_loss, last_layer=last_layer) except Exception: assert not self.training d_weight = paddle.to_tensor(0.0) else: d_weight = paddle.to_tensor(0.0) - disc_factor = adopt_weight( - self.disc_factor, - global_step, - threshold=self.discriminator_iter_start) - loss = (weighted_nll_loss + self.kl_weight * kl_loss + d_weight * - disc_factor * g_loss) + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) + loss = weighted_nll_loss + self.kl_weight * kl_loss + d_weight * disc_factor * g_loss log = { - "{}/total_loss".format(split): - loss.clone().detach().mean().item(), + "{}/total_loss".format(split): loss.clone().detach().mean().item(), "{}/logvar".format(split): self.logvar.detach().item(), "{}/kl_loss".format(split): kl_loss.detach().mean().item(), "{}/nll_loss".format(split): nll_loss.detach().mean().item(), "{}/rec_loss".format(split): rec_loss.detach().mean().item(), "{}/d_weight".format(split): d_weight.detach().item(), - "{}/disc_factor".format(split): - paddle.to_tensor(disc_factor).item(), + "{}/disc_factor".format(split): paddle.to_tensor(disc_factor).item(), "{}/g_loss".format(split): g_loss.detach().mean().item(), } return loss, log @@ -580,24 +529,14 @@ def forward( logits_real = self.discriminator(inputs.detach()) logits_fake = self.discriminator(reconstructions.detach()) else: - logits_real = self.discriminator( - paddle.concat( - (inputs.detach(), cond), axis=1)) - logits_fake = self.discriminator( - paddle.concat( - (reconstructions.detach(), cond), axis=1)) - disc_factor = adopt_weight( - self.disc_factor, - global_step, - threshold=self.discriminator_iter_start) + logits_real = self.discriminator(paddle.concat((inputs.detach(), cond), axis=1)) + logits_fake = self.discriminator(paddle.concat((reconstructions.detach(), cond), axis=1)) + disc_factor = adopt_weight(self.disc_factor, global_step, threshold=self.discriminator_iter_start) d_loss = disc_factor * self.disc_loss(logits_real, logits_fake) log = { - "{}/disc_loss".format(split): - d_loss.clone().detach().mean().item(), - "{}/logits_real".format(split): - logits_real.detach().mean().item(), - "{}/logits_fake".format(split): - logits_fake.detach().mean().item(), + "{}/disc_loss".format(split): d_loss.clone().detach().mean().item(), + "{}/logits_real".format(split): logits_real.detach().mean().item(), + "{}/logits_fake".format(split): logits_fake.detach().mean().item(), } return d_loss, log diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/model.py b/ppdiffusers/examples/autoencoder/vae/ldm/model.py index 81cd75c9787bc..5df1c98fe4c61 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/model.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/model.py @@ -22,8 +22,13 @@ from ppdiffusers.configuration_utils import ConfigMixin, register_to_config from 
ppdiffusers.initializer import reset_initialized_parameter from ppdiffusers.models.autoencoder_kl import ( - AutoencoderKLOutput, Decoder, DecoderOutput, DiagonalGaussianDistribution, - Encoder) + AutoencoderKLOutput, + Decoder, + DecoderOutput, + DiagonalGaussianDistribution, + Encoder, +) + # from ppdiffusers.models.ema import LitEma from ppdiffusers.models.modeling_utils import ModelMixin @@ -33,8 +38,7 @@ def count_params(model, verbose=True): total_params = sum(p.numel() for p in model.parameters()).item() if verbose: - print( - f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") + print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") return total_params @@ -44,59 +48,62 @@ class AutoencoderKLWithLoss(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=3, - out_channels: int=3, - down_block_types: Tuple[str]=( - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", ), - down_block_out_channels: Tuple[int]=None, - up_block_types: Tuple[str]=( - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", ), - up_block_out_channels: Tuple[int]=None, - block_out_channels: Tuple[int]=(128, 256, 512, 512), - layers_per_block: int=2, - act_fn: str="silu", - latent_channels: int=4, - norm_num_groups: int=32, - sample_size: int=512, - # new add - input_size: Tuple[int]=None, - # loss arguments - disc_start=50001, - kl_weight=1.0e-6, - disc_weight=0.5, - logvar_init=0.0, - pixelloss_weight=1.0, - disc_num_layers=3, - disc_in_channels=3, - disc_factor=1.0, - perceptual_weight=1.0, - use_actnorm=False, - disc_conditional=False, - disc_loss="hinge", - use_ema=False, - ema_decay=None, ): + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ( + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + ), + down_block_out_channels: Tuple[int] = None, + up_block_types: Tuple[str] = ( + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + ), + up_block_out_channels: Tuple[int] = None, + block_out_channels: Tuple[int] = (128, 256, 512, 512), + layers_per_block: int = 2, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 512, + # new add + input_size: Tuple[int] = None, + # loss arguments + disc_start=50001, + kl_weight=1.0e-6, + disc_weight=0.5, + logvar_init=0.0, + pixelloss_weight=1.0, + disc_num_layers=3, + disc_in_channels=3, + disc_factor=1.0, + perceptual_weight=1.0, + use_actnorm=False, + disc_conditional=False, + disc_loss="hinge", + use_ema=False, + ema_decay=None, + ): super().__init__() - self.input_size = ([int(_) for _ in input_size] - if input_size is not None else None) + self.input_size = [int(_) for _ in input_size] if input_size is not None else None self.encoder = Encoder( in_channels=in_channels, out_channels=latent_channels, down_block_types=down_block_types, block_out_channels=down_block_out_channels - if down_block_out_channels is - not None # if down_block_out_channels not givien, we will use block_out_channels + if down_block_out_channels + is not None # if down_block_out_channels not givien, we will use block_out_channels else block_out_channels, layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, - double_z=True, ) + double_z=True, + ) # pass init params to Decoder self.decoder = Decoder( @@ -104,10 +111,12 @@ def __init__( out_channels=out_channels, 
up_block_types=up_block_types, block_out_channels=up_block_out_channels # if up_block_out_channels not givien, we will use block_out_channels - if up_block_out_channels is not None else block_out_channels, + if up_block_out_channels is not None + else block_out_channels, layers_per_block=layers_per_block, norm_num_groups=norm_num_groups, - act_fn=act_fn, ) + act_fn=act_fn, + ) self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1) self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1) @@ -125,7 +134,8 @@ def __init__( perceptual_weight=perceptual_weight, use_actnorm=use_actnorm, disc_conditional=disc_conditional, - disc_loss=disc_loss, ) + disc_loss=disc_loss, + ) count_params(self) self.init_weights() self.use_ema = use_ema @@ -143,9 +153,10 @@ def init_weights(self): reset_initialized_parameter(self.post_quant_conv) def custom_forward( - self, - sample: paddle.Tensor, - sample_posterior: bool=True, ): + self, + sample: paddle.Tensor, + sample_posterior: bool = True, + ): posterior = self.encode(sample).latent_dist if sample_posterior: z = posterior.sample() @@ -183,8 +194,7 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0): if self.input_size is None: encoder_inputs = pixel_values else: - encoder_inputs = F.interpolate( - pixel_values, size=self.input_size, mode="bilinear") + encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") reconstructions, posterior = self.custom_forward(encoder_inputs) @@ -197,7 +207,8 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0): optimizer_idx, global_step, last_layer=self.get_last_layer(), - split="train", ) + split="train", + ) return aeloss, log_dict_ae if optimizer_idx == 1: @@ -209,7 +220,8 @@ def forward(self, pixel_values, optimizer_idx=0, global_step=0): optimizer_idx, global_step, last_layer=self.get_last_layer(), - split="train", ) + split="train", + ) return discloss, log_dict_disc @paddle.no_grad() @@ -219,21 +231,18 @@ def log_images(self, pixel_values, only_inputs=False, **kwargs): if self.input_size is None: encoder_inputs = pixel_values else: - encoder_inputs = F.interpolate( - pixel_values, size=self.input_size, mode="bilinear") + encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") if not only_inputs: xrec, posterior = self.custom_forward(encoder_inputs) - log["samples"] = self.decode_image( - self.decode(paddle.randn(posterior.sample().shape)).sample) + log["samples"] = self.decode_image(self.decode(paddle.randn(posterior.sample().shape)).sample) log["reconstructions"] = self.decode_image(xrec) if self.use_ema: with self.ema_scope(): - xrec_ema, posterior_ema = self.custom_forward( - encoder_inputs) + xrec_ema, posterior_ema = self.custom_forward(encoder_inputs) log["samples_ema"] = self.decode_image( - self.decode( - paddle.randn(posterior_ema.sample().shape)).sample) + self.decode(paddle.randn(posterior_ema.sample().shape)).sample + ) log["reconstructions_ema"] = self.decode_image(xrec_ema) # update log["encoder_inputs"] = self.decode_image(encoder_inputs) @@ -247,12 +256,10 @@ def decode_image(self, image): @paddle.no_grad() def validation_step(self, pixel_values, global_step=0): - log_dict_ae, log_dict_disc = self._validation_step(pixel_values, - global_step) + log_dict_ae, log_dict_disc = self._validation_step(pixel_values, global_step) if self.use_ema: with self.ema_scope(): - log_dict_ae_ema, log_dict_disc_ema = self._validation_step( - pixel_values, global_step, postfix="_ema") + log_dict_ae_ema, 
log_dict_disc_ema = self._validation_step(pixel_values, global_step, postfix="_ema") log_dict_ae.update(log_dict_ae_ema) log_dict_disc.update(log_dict_disc_ema) @@ -263,8 +270,7 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""): if self.input_size is None: encoder_inputs = pixel_values else: - encoder_inputs = F.interpolate( - pixel_values, size=self.input_size, mode="bilinear") + encoder_inputs = F.interpolate(pixel_values, size=self.input_size, mode="bilinear") reconstructions, posterior = self.custom_forward(encoder_inputs) aeloss, log_dict_ae = self.loss( @@ -274,7 +280,8 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""): 0, global_step, last_layer=self.get_last_layer(), - split="val" + postfix, ) + split="val" + postfix, + ) discloss, log_dict_disc = self.loss( pixel_values, @@ -283,7 +290,8 @@ def _validation_step(self, pixel_values, global_step=0, postfix=""): 1, global_step, last_layer=self.get_last_layer(), - split="val" + postfix, ) + split="val" + postfix, + ) self.train() return log_dict_ae, log_dict_disc @@ -333,26 +341,25 @@ def untoggle_optimizer(self, optimizers, optimizer_idx): if optimizer_idx != opt_idx: for param in opt._parameter_list: if param in self._param_stop_gradient_state: - param.stop_gradient = self._param_stop_gradient_state[ - param] + param.stop_gradient = self._param_stop_gradient_state[param] # save memory self._param_stop_gradient_state = {} - def encode(self, x: paddle.Tensor, return_dict: bool=True): + def encode(self, x: paddle.Tensor, return_dict: bool = True): h = self.encoder(x) moments = self.quant_conv(h) posterior = DiagonalGaussianDistribution(moments) if not return_dict: - return (posterior, ) + return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) - def decode(self, z: paddle.Tensor, return_dict: bool=True): + def decode(self, z: paddle.Tensor, return_dict: bool = True): z = self.post_quant_conv(z) dec = self.decoder(z) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) diff --git a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py b/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py index 8d3f4a8f4ac7a..4a91b34df3acc 100644 --- a/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py +++ b/ppdiffusers/examples/autoencoder/vae/ldm/text_image_pair.py @@ -77,22 +77,25 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + ): self.size = size if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), interpolation), + RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.file_list = [] @@ -115,19 +118,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = 
np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -136,9 +134,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: retry = 0 while True: line = f.readline() @@ -167,12 +163,9 @@ def sample_loader(self, file_ids, filenames): yield data def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -181,8 +174,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug @@ -211,8 +203,7 @@ def __iter__(self): return self.shuffle(iter(self.random_load_from_multi_dataset())) -def split_data_per_worker(dataset, worker_id, local_rank, world_size, - num_workers): +def split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers): worker_global_id = local_rank * num_workers + worker_id dataset.rng = np.random.RandomState(worker_global_id) for i in range(len(dataset.file_ids)): @@ -238,8 +229,7 @@ def worker_init_fn(_): world_size = dist.get_world_size() num_workers = worker_info.num_workers if isinstance(dataset, TextImagePair): - split_data_per_worker(dataset, worker_id, local_rank, world_size, - num_workers) + split_data_per_worker(dataset, worker_id, local_rank, world_size, num_workers) return np.random.seed(np.random.get_state()[1][0] + worker_id) else: return np.random.seed(np.random.get_state()[1][0] + worker_id) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py b/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py index 08141d43c821e..ebfb3ff1df677 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/calculate_psnr_ssim.py @@ -40,9 +40,7 @@ def reorder_image(img, input_order="HWC"): """ if input_order not in ["HWC", "CHW"]: - raise ValueError( - f"Wrong input_order {input_order}. Supported input_orders are " - "'HWC' and 'CHW'") + raise ValueError(f"Wrong input_order {input_order}. 
Supported input_orders are " "'HWC' and 'CHW'") if len(img.shape) == 2: img = img[..., None] if input_order == "CHW": @@ -68,12 +66,9 @@ def calculate_psnr(img, img2, crop_border, input_order="HWC", **kwargs): float: psnr result. """ - assert (img.shape == img2.shape - ), f"Image shapes are different: {img.shape}, {img2.shape}." + assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}." if input_order not in ["HWC", "CHW"]: - raise ValueError( - f"Wrong input_order {input_order}. Supported input_orders are " - '"HWC" and "CHW"') + raise ValueError(f"Wrong input_order {input_order}. Supported input_orders are " '"HWC" and "CHW"') img = reorder_image(img, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) img = img.astype(np.float64) @@ -83,7 +78,7 @@ def calculate_psnr(img, img2, crop_border, input_order="HWC", **kwargs): img = img[crop_border:-crop_border, crop_border:-crop_border, ...] img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...] - mse = np.mean((img - img2)**2) + mse = np.mean((img - img2) ** 2) if mse == 0: return float("inf") return 20.0 * np.log10(255.0 / np.sqrt(mse)) @@ -102,8 +97,8 @@ def _ssim(img, img2): float: ssim result. """ - c1 = (0.01 * 255)**2 - c2 = (0.03 * 255)**2 + c1 = (0.01 * 255) ** 2 + c2 = (0.03 * 255) ** 2 img = img.astype(np.float64) img2 = img2.astype(np.float64) @@ -119,8 +114,7 @@ def _ssim(img, img2): sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2 - ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / ( - (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) + ssim_map = ((2 * mu1_mu2 + c1) * (2 * sigma12 + c2)) / ((mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)) return ssim_map.mean() @@ -149,12 +143,9 @@ def calculate_ssim(img, img2, crop_border, input_order="HWC", **kwargs): float: ssim result. """ - assert (img.shape == img2.shape - ), f"Image shapes are different: {img.shape}, {img2.shape}." + assert img.shape == img2.shape, f"Image shapes are different: {img.shape}, {img2.shape}." if input_order not in ["HWC", "CHW"]: - raise ValueError( - f"Wrong input_order {input_order}. Supported input_orders are " - '"HWC" and "CHW"') + raise ValueError(f"Wrong input_order {input_order}. 
Supported input_orders are " '"HWC" and "CHW"') img = reorder_image(img, input_order=input_order) img2 = reorder_image(img2, input_order=input_order) img = img.astype(np.float64) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py b/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py index d466ef6155819..d239d53cf5fcf 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/convert_kl_8_to_ppdiffusers.py @@ -53,8 +53,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -70,8 +69,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -113,8 +111,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -122,21 +119,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. 
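# --- Illustrative sketch, not part of the diff: the fused-qkv split performed by the
# code just below. Shapes and the num_head_channels value are hypothetical; only the
# leading dimension (3 * channels) matters for the reshape-and-split.
import numpy as np

channels, num_head_channels = 512, 64
num_heads = (3 * channels) // num_head_channels // 3        # mirrors the expression used below
fused = np.random.randn(3 * channels, channels)             # a fused qkv projection weight
per_head = fused.reshape((num_heads, 3 * channels // num_heads) + fused.shape[1:])
query, key, value = np.split(per_head, 3, axis=1)           # three (num_heads, C // num_heads, C) chunks
query = query.reshape(-1, channels)                         # back to a (channels, channels) weight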
if attention_paths_to_split is not None: @@ -144,13 +140,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -161,8 +155,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -172,8 +165,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -202,14 +194,10 @@ def create_vae_diffusers_config(original_config): decoder_vae_params = original_config.model.params.ddconfig.decoder vae_params = decoder_vae_params - encoder_block_out_channels = [ - encoder_vae_params.ch * mult for mult in encoder_vae_params.ch_mult - ] + encoder_block_out_channels = [encoder_vae_params.ch * mult for mult in encoder_vae_params.ch_mult] down_block_types = ["DownEncoderBlock2D"] * len(encoder_block_out_channels) - decoder_block_out_channels = [ - decoder_vae_params.ch * mult for mult in decoder_vae_params.ch_mult - ] + decoder_block_out_channels = [decoder_vae_params.ch * mult for mult in decoder_vae_params.ch_mult] up_block_types = ["UpDecoderBlock2D"] * len(decoder_block_out_channels) config = dict( @@ -222,114 +210,82 @@ def create_vae_diffusers_config(original_config): down_block_out_channels=tuple(encoder_block_out_channels), up_block_out_channels=tuple(decoder_block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config def convert_ldm_vae_checkpoint(vae_state_dict, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - 
new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = 
renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -337,58 +293,50 @@ def convert_ldm_vae_checkpoint(vae_state_dict, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] 
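# --- Illustrative sketch, not part of the diff: how assign_to_checkpoint() applies the
# meta_path replacement built above to the decoder mid-block attention keys collected in
# mid_attentions. The key names here are examples only; the per-key renames from
# renew_vae_attention_paths (e.g. proj_out -> proj_attn) happen before this step.
replacements = [{"old": "mid.attn_1", "new": "mid_block.attentions.0"}]
for old_key in ("decoder.mid.attn_1.proj_attn.weight", "decoder.mid.attn_1.proj_attn.bias"):
    new_key = old_key
    for rep in replacements:
        new_key = new_key.replace(rep["old"], rep["new"])   # same str.replace as in assign_to_checkpoint
    print(old_key, "->", new_key)
# decoder.mid.attn_1.proj_attn.weight -> decoder.mid_block.attentions.0.proj_attn.weight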
paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -396,14 +344,13 @@ def convert_ldm_vae_checkpoint(vae_state_dict, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -442,7 +389,8 @@ def check_keys(model, state_dict): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default="../config/f8encoder_f16decoder.yaml", @@ -453,13 +401,15 @@ def check_keys(model, state_dict): "--dtype", default="float32", type=str, - help="Dtype of model weights.", ) + help="Dtype of model weights.", + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() @@ -469,11 +419,9 @@ def check_keys(model, state_dict): vae_config = create_vae_diffusers_config(original_config) # 1. convert vae encoder and decoder - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint, args.dtype) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint, args.dtype) # 2. convert losses maps = { @@ -491,7 +439,7 @@ def check_keys(model, state_dict): k = k.replace(old, new) # paddle donot support 0d tensor if v.ndim == 0: - v = v.reshape((1, )) + v = v.reshape((1,)) # rename if "perceptual_loss.lin" in k: k = k.replace("perceptual_loss.lin", "perceptual_loss.lins.") @@ -501,5 +449,4 @@ def check_keys(model, state_dict): check_keys(vae, ppdiffusers_vae_checkpoint) vae.save_config(args.dump_path) # 4. save state_dict - paddle.save(ppdiffusers_vae_checkpoint, - os.path.join(args.dump_path, "model_state.pdparams")) + paddle.save(ppdiffusers_vae_checkpoint, os.path.join(args.dump_path, "model_state.pdparams")) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py b/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py index 6bc24b3d88bab..0e7e08a580299 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/fid_score.py @@ -67,35 +67,28 @@ def tqdm(x): from inception import InceptionV3 parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument( - "--batch-size", type=int, default=50, help="Batch size to use") +parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use") parser.add_argument( "--num-workers", type=int, - help=("Number of processes to use for data loading. " - "Defaults to `min(8, num_cpus)`"), ) -parser.add_argument( - "--device", - type=str, - default=None, - help="Device to use. Like gpu, gpu:0 or cpu") + help=("Number of processes to use for data loading. 
" "Defaults to `min(8, num_cpus)`"), +) +parser.add_argument("--device", type=str, default=None, help="Device to use. Like gpu, gpu:0 or cpu") parser.add_argument( "--dims", type=int, default=2048, choices=list(InceptionV3.BLOCK_INDEX_BY_DIM), - help=("Dimensionality of Inception features to use. " - "By default, uses pool3 features"), ) + help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"), +) parser.add_argument( "path", type=str, nargs=2, - help=("Paths to the generated images or " - "to .npz statistic files"), ) + help=("Paths to the generated images or " "to .npz statistic files"), +) -IMAGE_EXTENSIONS = { - "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp" -} +IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} class ImagePathDataset(paddle.io.Dataset): @@ -136,8 +129,7 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): model.eval() if batch_size > len(files): - print(("Warning: batch size is bigger than the data size. " - "Setting batch size to data size")) + print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size")) batch_size = len(files) dataset = ImagePathDataset(files, transforms=TF.ToTensor()) @@ -146,7 +138,8 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): batch_size=batch_size, shuffle=False, drop_last=False, - num_workers=num_workers, ) + num_workers=num_workers, + ) pred_arr = np.empty((len(files), dims)) @@ -165,7 +158,7 @@ def get_activations(files, model, batch_size=50, dims=2048, num_workers=1): pred = pred.squeeze(3).squeeze(2).cpu().numpy() - pred_arr[start_idx:start_idx + pred.shape[0]] = pred + pred_arr[start_idx : start_idx + pred.shape[0]] = pred start_idx = start_idx + pred.shape[0] @@ -200,18 +193,15 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): sigma1 = np.atleast_2d(sigma1) sigma2 = np.atleast_2d(sigma2) - assert (mu1.shape == mu2.shape - ), "Training and test mean vectors have different lengths" - assert (sigma1.shape == sigma2.shape - ), "Training and test covariances have different dimensions" + assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" + assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" diff = mu1 - mu2 # Product might be almost singular covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) if not np.isfinite(covmean).all(): - msg = ("fid calculation produces singular product; " - "adding %s to diagonal of cov estimates") % eps + msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps print(msg) offset = np.eye(sigma1.shape[0]) * eps covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) @@ -228,11 +218,7 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean -def calculate_activation_statistics(files, - model, - batch_size=50, - dims=2048, - num_workers=1): +def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1): """Calculation of the statistics used by the FID. 
Params: -- files : List of image files paths @@ -261,13 +247,8 @@ def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1): m, s = f["mu"][:], f["sigma"][:] else: path = pathlib.Path(path) - files = sorted([ - file - for ext in IMAGE_EXTENSIONS - for file in path.glob("*.{}".format(ext)) - ]) - m, s = calculate_activation_statistics(files, model, batch_size, dims, - num_workers) + files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))]) + m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers) return m, s @@ -282,10 +263,8 @@ def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1): model = InceptionV3([block_idx]) - m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, - num_workers) - m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, - num_workers) + m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers) + m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers) fid_value = calculate_frechet_distance(m1, s1, m2, s2) return fid_value @@ -302,8 +281,7 @@ def main(): else: num_workers = args.num_workers - fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims, - num_workers) + fid_value = calculate_fid_given_paths(args.path, args.batch_size, args.dims, num_workers) print("FID: ", fid_value) diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py b/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py index 3eb58b8b7de40..7e5eadaf365b2 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/get_autoencoder_results.py @@ -25,15 +25,16 @@ from ppdiffusers import AutoencoderKL, StableDiffusionImg2ImgPipeline -image_processing = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), -]) +image_processing = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] +) def decode_image(image): - image = (image / 2 + 0.5).clip(0, 1).transpose( - [0, 2, 3, 1]).cast("float32").numpy() + image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]).cast("float32").numpy() image = StableDiffusionImg2ImgPipeline.numpy_to_pil(image) return image @@ -62,8 +63,7 @@ def main(vae_path, src_size, tgt_size, imgs, outdir): z = model.encode(img).latent_dist.sample() recon = model.decode(z).sample - decode_image(recon)[0].save( - osp.join(outdir, osp.basename(img_path))) + decode_image(recon)[0].save(osp.join(outdir, osp.basename(img_path))) if __name__ == "__main__": diff --git a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py b/ppdiffusers/examples/autoencoder/vae/scripts/inception.py index 9aecdf265779a..bbdff9a933432 100644 --- a/ppdiffusers/examples/autoencoder/vae/scripts/inception.py +++ b/ppdiffusers/examples/autoencoder/vae/scripts/inception.py @@ -21,7 +21,8 @@ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz FID_WEIGHTS_URL = ( "https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams", - "8e2ae24c34c5c8b81d45167bb9361f4c", ) + "8e2ae24c34c5c8b81d45167bb9361f4c", +) WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams" @@ -47,17 +48,18 @@ class ConvNormActivation(nn.Sequential): """ def __init__( - self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=None, - groups=1, - norm_layer=nn.BatchNorm2D, - 
activation_layer=nn.ReLU, - dilation=1, - bias=None, ): + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=nn.BatchNorm2D, + activation_layer=nn.ReLU, + dilation=1, + bias=None, + ): if padding is None: padding = (kernel_size - 1) // 2 * dilation if bias is None: @@ -71,7 +73,8 @@ def __init__( padding, dilation=dilation, groups=groups, - bias_attr=bias, ) + bias_attr=bias, + ) ] if norm_layer is not None: # The hyperparameter of BatchNorm2D is different from PaddlePaddle. @@ -97,12 +100,13 @@ class InceptionV3(nn.Layer): } def __init__( - self, - output_blocks=(DEFAULT_BLOCK_INDEX, ), - resize_input=True, - normalize_input=True, - requires_grad=False, - use_fid_inception=True, ): + self, + output_blocks=(DEFAULT_BLOCK_INDEX,), + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True, + ): """Build pretrained InceptionV3 Parameters @@ -211,8 +215,7 @@ def forward(self, inp): outp = [] x = inp if self.resize_input: - x = F.interpolate( - x, size=(299, 299), mode="bilinear", align_corners=False) + x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False) if self.normalize_input: x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) @@ -235,8 +238,7 @@ def hack_bn_layer(layer): def _inception_v3(*args, **kwargs): """Wraps `paddle.vision.models.inception_v3`""" - return paddle.vision.models.inception_v3(*args, - **kwargs).apply(hack_bn_layer) + return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer) def fid_inception_v3(): @@ -248,8 +250,7 @@ def fid_inception_v3(): This method first constructs paddle.vision's Inception and then patches the necessary parts that are different in the FID Inception model. """ - inception = _inception_v3( - num_classes=1008, with_pool=True, pretrained=False) + inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False) inception.inception_block_list[0] = InceptionA(192, pool_features=32) inception.inception_block_list[1] = InceptionA(256, pool_features=64) inception.inception_block_list[2] = InceptionA(288, pool_features=64) @@ -260,8 +261,7 @@ def fid_inception_v3(): inception.inception_block_list[9] = InceptionE_1(1280) inception.inception_block_list[10] = InceptionE_2(2048) - weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], - FID_WEIGHTS_URL[1]) + weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1]) state_dict = paddle.load(weight_path) inception.set_state_dict(state_dict) return inception @@ -275,49 +275,55 @@ def __init__(self, num_channels, pool_features): out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_1 = ConvNormActivation( in_channels=num_channels, out_channels=48, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_2 = ConvNormActivation( in_channels=48, out_channels=64, kernel_size=5, padding=2, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=64, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3 = ConvNormActivation( in_channels=96, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, 
+ ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=pool_features, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -330,8 +336,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) return x @@ -343,7 +348,8 @@ def __init__(self, num_channels, channels_7x7): out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_1 = ConvNormActivation( in_channels=num_channels, @@ -351,62 +357,70 @@ def __init__(self, num_channels, channels_7x7): kernel_size=1, stride=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), stride=1, padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(7, 1), stride=1, padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=channels_7x7, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_4 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_5 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -424,8 +438,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) return x @@ -438,61 +451,69 @@ def __init__(self, num_channels): out_channels=320, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_1 = ConvNormActivation( 
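The `# Patch:` comments in this hunk note that TensorFlow's average pooling excludes the padded zeros from its mean, which is why the reformatted calls keep `exclusive=True` on `nn.AvgPool2D`. A minimal NumPy-free sketch of the difference at an image corner (the helper and values below are illustrative, not from the diff):

def avg_pool_corner(values, pad_zeros, exclusive):
    # 3x3 window at a corner with padding=1: `values` are the real pixels
    # covered, `pad_zeros` the padded positions (which add 0 to the sum).
    total = sum(values)
    count = len(values) if exclusive else len(values) + pad_zeros
    return total / count

window = [1.0, 2.0, 3.0, 4.0]                                  # 2x2 valid pixels in the corner
print(avg_pool_corner(window, pad_zeros=5, exclusive=True))    # 2.5  (TF-style: divide by valid count)
print(avg_pool_corner(window, pad_zeros=5, exclusive=False))   # ~1.111 (padded zeros counted in the divisor)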
in_channels=num_channels, out_channels=384, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=448, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=448, out_channels=384, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -515,8 +536,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x @@ -549,6 +569,5 @@ def forward(self, x): branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x diff --git a/ppdiffusers/examples/autoencoder/vae/train_vae.py b/ppdiffusers/examples/autoencoder/vae/train_vae.py index e96c6718040c0..44a8798100e3a 100644 --- a/ppdiffusers/examples/autoencoder/vae/train_vae.py +++ b/ppdiffusers/examples/autoencoder/vae/train_vae.py @@ -28,8 +28,7 @@ from tqdm.auto import tqdm from ppdiffusers.models.ema import LitEma -from ppdiffusers.training_utils import (freeze_params, main_process_first, - unwrap_model) +from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model def read_json(file): @@ -56,8 +55,7 @@ def run_evaluate(vae, val_dataloader, writer, global_step): log_dict_ae_all = defaultdict(list) log_dict_disc_all = defaultdict(list) for batch in val_dataloader: - log_dict_ae, log_dict_disc = unwrap_model(vae).validation_step( - batch["image"], global_step=global_step) + log_dict_ae, log_dict_disc = unwrap_model(vae).validation_step(batch["image"], global_step=global_step) for k, v in log_dict_ae.items(): if "loss" not in k: continue @@ -71,25 +69,21 @@ def run_evaluate(vae, val_dataloader, writer, global_step): def parse_args(): - parser = argparse.ArgumentParser( - description="Simple example of a training a autoencoder model script.") + parser = 
argparse.ArgumentParser(description="Simple example of a training a autoencoder model script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, default=None, required=False, - help="Path to pretrained model or model identifier from bos.", ) + help="Path to pretrained model or model identifier from bos.", + ) parser.add_argument( "--output_dir", type=str, default="autoencoder_outputs", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=23, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=23, help="A seed for reproducible training.") parser.add_argument( "--batch_size", type=int, @@ -112,48 +106,39 @@ def parse_args(): parser.add_argument( "--scale_lr", action="store_true", - help="Scale base-lr by ngpu * batch_size", ) - parser.add_argument( - "--freeze_encoder", - action="store_true", - help="Whether to freeze encoder layer.") + help="Scale base-lr by ngpu * batch_size", + ) + parser.add_argument("--freeze_encoder", action="store_true", help="Whether to freeze encoder layer.") parser.add_argument( "--from_scratch", action="store_true", - help="Whether to train new model from scratch. ", ) - parser.add_argument( - "--vae_config_file", - default=None, - type=str, - help="Path to the vae_config_file.") + help="Whether to train new model from scratch. ", + ) + parser.add_argument("--vae_config_file", default=None, type=str, help="Path to the vae_config_file.") parser.add_argument( "--logging_dir", type=str, default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) - parser.add_argument( - "--logging_steps", - default=100, - type=int, - help="The interval steps to logging.") + help="Log writer type.", + ) + parser.add_argument("--logging_steps", default=100, type=int, help="The interval steps to logging.") parser.add_argument( "--image_logging_steps", default=500, type=int, - help="The interval steps to logging images.", ) - parser.add_argument( - "--save_steps", - default=2000, - type=int, - help="The interval steps to saveing.") + help="The interval steps to logging images.", + ) + parser.add_argument("--save_steps", default=2000, type=int, help="The interval steps to saveing.") parser.add_argument( "--ignore_keys", default=[], @@ -166,136 +151,152 @@ def parse_args(): default=None, type=int, nargs="*", - help="The height and width of the input at the encoder.", ) + help="The height and width of the input at the encoder.", + ) # dataset parser.add_argument( "--dataset_type", type=str, default="text_image_pair", choices=["imagenet", "text_image_pair"], - help="The type of dataset.", ) + help="The type of dataset.", + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--degradation", type=str, default="pil_nearest", - help="Degradation_fn, e.g. cv_bicubic, bsrgan_light, or pil_nearest", ) + help="Degradation_fn, e.g. 
cv_bicubic, bsrgan_light, or pil_nearest", + ) parser.add_argument( "--file_list", type=str, default="./data/filelist/train.filelist.list", - help="Path to the train file_list.", ) + help="Path to the train file_list.", + ) parser.add_argument( "--num_workers", type=int, default=8, - help="The number of subprocess to load data.", ) + help="The number of subprocess to load data.", + ) parser.add_argument( "--num_records", type=int, default=62500, - help="The num_records of the text_image_pair dataset.", ) + help="The num_records of the text_image_pair dataset.", + ) parser.add_argument( "--buffer_size", type=int, default=100, - help="The buffer size of the text_image_pair dataset.", ) + help="The buffer size of the text_image_pair dataset.", + ) parser.add_argument( "--shuffle_every_n_samples", type=int, default=5, - help="The shuffle_every_n_samples of the text_image_pair dataset.", ) + help="The shuffle_every_n_samples of the text_image_pair dataset.", + ) parser.add_argument( "--init_from_ckpt", type=str, default=None, - help="The path of checkpoint to be loaded.", ) + help="The path of checkpoint to be loaded.", + ) # loss fn parser.add_argument( "--disc_start", type=int, default=50001, - help="The number of steps the discriminator started.", ) + help="The number of steps the discriminator started.", + ) parser.add_argument( "--kl_weight", type=float, default=1.0e-6, - help="The weight ratio of the kl_loss.", ) + help="The weight ratio of the kl_loss.", + ) parser.add_argument( "--disc_weight", type=float, default=0.5, - help="The weight ratio of the disc_loss.", ) + help="The weight ratio of the disc_loss.", + ) parser.add_argument( "--logvar_init", type=float, default=0.0, - help="The init value of the output log variances.", ) + help="The init value of the output log variances.", + ) parser.add_argument( "--pixelloss_weight", type=float, default=1.0, - help="The weight ratio of the pixelloss.", ) + help="The weight ratio of the pixelloss.", + ) parser.add_argument( "--disc_num_layers", type=int, default=3, - help="The num layers of the discriminator.", ) + help="The num layers of the discriminator.", + ) parser.add_argument( "--disc_in_channels", type=int, default=3, - help="The in channels of the discriminator.", ) + help="The in channels of the discriminator.", + ) parser.add_argument( "--disc_factor", type=float, default=1.0, - help="The factor of the discriminator loss.", ) + help="The factor of the discriminator loss.", + ) parser.add_argument( "--perceptual_weight", type=float, default=1.0, - help="The weight ratio of the perceptual loss.", ) + help="The weight ratio of the perceptual loss.", + ) parser.add_argument( "--use_actnorm", action="store_true", - help="Whether to use actnorm in NLayerDiscriminator layer.", ) + help="Whether to use actnorm in NLayerDiscriminator layer.", + ) parser.add_argument( "--disc_conditional", action="store_true", - help="Whether to use conditional discriminator.", ) + help="Whether to use conditional discriminator.", + ) parser.add_argument( "--disc_loss", type=str, choices=["hinge", "vanilla"], default="hinge", - help="The type of discriminator loss.", ) - parser.add_argument( - "--use_ema", action="store_true", help="Whether to use_ema.") + help="The type of discriminator loss.", + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use_ema.") parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether to enable_xformers_memory_efficient_attention.", ) - parser.add_argument( 
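The options in this hunk (`--pixelloss_weight`, `--perceptual_weight`, `--kl_weight`, `--disc_weight`, `--disc_factor`, `--disc_start`) parameterize an LDM-style autoencoder objective. The loss module itself is not part of this diff, so the sketch below only indicates the rough way such weights are typically combined; the function name and the exact formula are assumptions, not the project's implementation:

def sketch_ae_loss(rec_loss, perceptual_loss, kl_loss, g_loss, global_step,
                   pixelloss_weight=1.0, perceptual_weight=1.0,
                   kl_weight=1.0e-6, disc_weight=0.5, disc_factor=1.0,
                   disc_start=50001):
    # Rough shape of a VAE-GAN objective: weighted reconstruction plus a small
    # KL term, with the adversarial term switched on only after `disc_start`.
    # The real loss class (adaptive discriminator weight, learned logvar, ...)
    # is not shown in this diff.
    nll = pixelloss_weight * rec_loss + perceptual_weight * perceptual_loss
    adv = disc_factor * disc_weight * g_loss if global_step >= disc_start else 0.0
    return nll + kl_weight * kl_loss + adv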
- "--recompute", action="store_true", help="Whether to recompute.") - parser.add_argument( - "--ema_decay", - type=float, - default=0.9999, - help="The value of ema_decay.") + help="Whether to enable_xformers_memory_efficient_attention.", + ) + parser.add_argument("--recompute", action="store_true", help="Whether to recompute.") + parser.add_argument("--ema_decay", type=float, default=0.9999, help="The value of ema_decay.") args = parser.parse_args() args.logging_dir = os.path.join(args.output_dir, args.logging_dir) - args.image_logging_steps = ( - math.ceil(args.image_logging_steps / args.logging_steps) * - args.logging_steps) + args.image_logging_steps = math.ceil(args.image_logging_steps / args.logging_steps) * args.logging_steps return args @@ -358,7 +359,8 @@ def main(): disc_loss=args.disc_loss, ema_decay=args.ema_decay, use_ema=args.use_ema, - **model_kwargs, ) + **model_kwargs, + ) else: assert args.vae_config_file is not None, "We must supply vae_config_file!" # Load config: train model from scatch @@ -378,7 +380,8 @@ def main(): disc_conditional=args.disc_conditional, disc_loss=args.disc_loss, ema_decay=args.ema_decay, - use_ema=args.use_ema, ) + use_ema=args.use_ema, + ) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) @@ -390,8 +393,7 @@ def main(): args.learning_rate = num_processes * args.batch_size * args.learning_rate # configure_optimizers - parameters = list(vae.decoder.parameters()) + list( - vae.post_quant_conv.parameters()) + parameters = list(vae.decoder.parameters()) + list(vae.post_quant_conv.parameters()) # we may freeze_encoder if not args.freeze_encoder: parameters += list(vae.encoder.parameters()) @@ -401,16 +403,13 @@ def main(): freeze_params(vae.quant_conv.parameters()) print("Freeze vae.encoder.parameters and vae.quant_conv.parameters!") - opt_ae = Adam( - parameters=parameters, - learning_rate=args.learning_rate, - beta1=0.5, - beta2=0.9) + opt_ae = Adam(parameters=parameters, learning_rate=args.learning_rate, beta1=0.5, beta2=0.9) opt_disc = Adam( parameters=vae.loss.discriminator.parameters(), learning_rate=args.learning_rate, beta1=0.5, - beta2=0.9, ) + beta2=0.9, + ) if args.use_ema: vae.model_ema = LitEma(vae, decay=args.ema_decay) if args.recompute: @@ -427,27 +426,17 @@ def main(): from ldm import ImageNetSRTrain, ImageNetSRValidation with main_process_first(): - train_dataset = ImageNetSRTrain( - size=args.resolution, degradation=args.degradation) - val_dataset = ImageNetSRValidation( - size=args.resolution, degradation=args.degradation) - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.batch_size, - shuffle=True)) - train_dataloader = DataLoader( - train_dataset, - batch_sampler=train_sampler, - num_workers=args.num_workers) - - val_sampler = BatchSampler( - val_dataset, batch_size=args.batch_size * 2, shuffle=False) - val_dataloader = DataLoader( - val_dataset, - batch_sampler=val_sampler, - num_workers=args.num_workers) + train_dataset = ImageNetSRTrain(size=args.resolution, degradation=args.degradation) + val_dataset = ImageNetSRValidation(size=args.resolution, degradation=args.degradation) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.batch_size, shuffle=True) + ) + train_dataloader = DataLoader(train_dataset, 
batch_sampler=train_sampler, num_workers=args.num_workers) + + val_sampler = BatchSampler(val_dataset, batch_size=args.batch_size * 2, shuffle=False) + val_dataloader = DataLoader(val_dataset, batch_sampler=val_sampler, num_workers=args.num_workers) else: train_dataset = TextImagePair( file_list=args.file_list, @@ -455,19 +444,21 @@ def main(): num_records=args.num_records, buffer_size=args.buffer_size, shuffle_every_n_samples=args.shuffle_every_n_samples, - interpolation="lanczos", ) + interpolation="lanczos", + ) train_dataloader = DataLoader( train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) val_dataloader = val_dataset = None # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = ( - len(train_dataloader) if args.dataset_type == "imagenet" else - math.ceil(len(train_dataset) / args.batch_size)) + len(train_dataloader) if args.dataset_type == "imagenet" else math.ceil(len(train_dataset) / args.batch_size) + ) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True @@ -475,8 +466,7 @@ def main(): if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if rank == 0: logger.info("----------- Configuration Arguments -----------") @@ -492,9 +482,7 @@ def main(): logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed) = {total_batch_size}" - ) + logger.info(f" Total train batch size (w. 
parallel, distributed) = {total_batch_size}") logger.info(f" Total optimization steps = {args.max_train_steps}") logger.info( f" Number of trainable parameters = {sum(p.numel().item() for p in vae.parameters() if not p.stop_gradient) }" @@ -515,9 +503,7 @@ def main(): # pytorch_lightning use this `toggle_optimizer` method # ref: https://github.com/Lightning-AI/lightning/blob/a58639ce7e864dd70484e7d34c37730ae204183c/src/pytorch_lightning/core/module.py#L1419-L1447 unwrap_model(vae).toggle_optimizer(optimizers, optimizer_idx) - loss, log_dict = vae(batch["image"], - optimizer_idx=optimizer_idx, - global_step=global_step) + loss, log_dict = vae(batch["image"], optimizer_idx=optimizer_idx, global_step=global_step) optimizers[optimizer_idx].clear_grad() loss.backward() optimizers[optimizer_idx].step() @@ -541,17 +527,13 @@ def main(): if global_step % args.image_logging_steps == 0: images_log = unwrap_model(vae).log_images(batch["image"]) for name, val in images_log.items(): - writer.add_image( - name, val, global_step, dataformats="NHWC") + writer.add_image(name, val, global_step, dataformats="NHWC") # saving if global_step % args.save_steps == 0: if val_dataloader is not None: - run_evaluate( - unwrap_model(vae), val_dataloader, writer, - global_step) - output_dir = os.path.join( - args.output_dir, "checkpoint-{}".format(global_step)) + run_evaluate(unwrap_model(vae), val_dataloader, writer, global_step) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) unwrap_model(vae).save_pretrained(output_dir) del logs diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py index 0c715dcb16fff..0c943be785d26 100644 --- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py +++ b/ppdiffusers/examples/clip_interrogator/clip_interrogator/blip_decoder.py @@ -13,19 +13,15 @@ @patch_to(BeamHypotheses) -def add(self: BeamHypotheses, - hyp: paddle.Tensor, - sum_logprobs: float, - origin_len: int=0) -> None: +def add(self: BeamHypotheses, hyp: paddle.Tensor, sum_logprobs: float, origin_len: int = 0) -> None: """ Add a new hypothesis to the list. """ - score = sum_logprobs / (hyp.shape[-1]**self.length_penalty) + score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty) if len(self) < self.num_beams or score > self.worst_score: self.beams.append((score, hyp)) if len(self) > self.num_beams: - sorted_next_scores = sorted( - [(s, idx) for idx, (s, _) in enumerate(self.beams)]) + sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) del self.beams[sorted_next_scores[0][1]] self.worst_score = sorted_next_scores[1][0] else: @@ -33,10 +29,7 @@ def add(self: BeamHypotheses, @patch_to(BeamHypotheses) -def is_done(self: BeamHypotheses, - best_sum_logprobs: float, - cur_len: int, - origin_len: int=0) -> bool: +def is_done(self: BeamHypotheses, best_sum_logprobs: float, cur_len: int, origin_len: int = 0) -> bool: """ If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst one in the heap, then we are done with this sentence. 
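The patched `add` above scores a finished hypothesis as `sum_logprobs / hyp.shape[-1] ** self.length_penalty`, so longer sequences are not penalized merely for accumulating more negative log-probabilities, and `is_done` stops the search once no pending beam can beat the worst kept score. A small standalone example of that scoring rule:

def hyp_score(sum_logprobs, length, length_penalty=1.0):
    # Length-normalized score used to compare finished beam hypotheses,
    # mirroring the patched `add` above.
    return sum_logprobs / (length ** length_penalty)

short = [-0.5, -0.6]                        # 2 tokens, raw sum -1.1
longer = [-0.5, -0.6, -0.4, -0.3]           # 4 tokens, raw sum -1.8
print(hyp_score(sum(short), len(short)))    # -0.55
print(hyp_score(sum(longer), len(longer)))  # -0.45: preferred despite the lower raw sum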
@@ -54,35 +47,31 @@ def is_done(self: BeamHypotheses, class BLIP_Decoder(nn.Layer): def __init__( - self, - pretrained_model_name_or_path, - prompt="a picture of ", ): + self, + pretrained_model_name_or_path, + prompt="a picture of ", + ): super().__init__() - self.text_decoder = BlipForConditionalGeneration.from_pretrained( - pretrained_model_name_or_path) + self.text_decoder = BlipForConditionalGeneration.from_pretrained(pretrained_model_name_or_path) self.text_decoder.eval() - self.processor = BlipProcessor.from_pretrained( - pretrained_model_name_or_path) + self.processor = BlipProcessor.from_pretrained(pretrained_model_name_or_path) self.processor.tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - self.processor.tokenizer.add_special_tokens({ - "additional_special_tokens": ["[ENC]"] - }) - self.processor.tokenizer.enc_token_id = ( - self.processor.tokenizer.additional_special_tokens_ids[0]) + self.processor.tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]}) + self.processor.tokenizer.enc_token_id = self.processor.tokenizer.additional_special_tokens_ids[0] self.prompt = prompt - self.prompt_length = len( - self.processor.tokenizer(self.prompt).input_ids) - 1 + self.prompt_length = len(self.processor.tokenizer(self.prompt).input_ids) - 1 def generate( - self, - image, - prompt=None, - sample=False, - num_beams=3, - max_length=30, - min_length=10, - top_p=0.9, - repetition_penalty=1.0, ): + self, + image, + prompt=None, + sample=False, + num_beams=3, + max_length=30, + min_length=10, + top_p=0.9, + repetition_penalty=1.0, + ): if prompt is None: prompt = self.prompt prompt_length = self.prompt_length @@ -93,8 +82,7 @@ def generate( else: model_kwargs = {"pixel_values": image} prompt = [prompt] * model_kwargs["pixel_values"].shape[0] - input_ids = self.processor.tokenizer( - prompt, return_tensors="pd").input_ids + input_ids = self.processor.tokenizer(prompt, return_tensors="pd").input_ids if sample: # nucleus sampling @@ -106,7 +94,8 @@ def generate( top_p=top_p, num_return_sequences=1, repetition_penalty=repetition_penalty, - **model_kwargs, )[0] + **model_kwargs, + )[0] else: if num_beams == 1: # greedy search @@ -115,7 +104,8 @@ def generate( max_length=max_length - prompt_length, min_length=min_length, decode_strategy="greedy_search", - **model_kwargs, )[0] + **model_kwargs, + )[0] else: # beam search outputs = self.text_decoder.generate( @@ -126,11 +116,10 @@ def generate( decode_strategy="beam_search", repetition_penalty=repetition_penalty, length_penalty=1.0, # note this is not - **model_kwargs, )[0] + **model_kwargs, + )[0] captions = [] for output in outputs: - captions.append( - self.processor.decode( - output, skip_special_tokens=True)) + captions.append(self.processor.decode(output, skip_special_tokens=True)) return captions diff --git a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py b/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py index 380024d3d617e..9cefe1a3b543d 100644 --- a/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py +++ b/ppdiffusers/examples/clip_interrogator/clip_interrogator/clip_interrogator.py @@ -63,19 +63,16 @@ def __init__(self, config: Config): def load_blip_model(self): config = self.config - self.blip_model = BLIP_Decoder(pretrained_model_name_or_path=config. 
- blip_pretrained_model_name_or_path) + self.blip_model = BLIP_Decoder(pretrained_model_name_or_path=config.blip_pretrained_model_name_or_path) self.blip_model.eval() def load_clip_model(self): config = self.config # clip model - self.clip_model: CLIPModel = CLIPModel.from_pretrained( - config.clip_pretrained_model_name_or_path) + self.clip_model: CLIPModel = CLIPModel.from_pretrained(config.clip_pretrained_model_name_or_path) self.clip_model.eval() - self.clip_preprocess = CLIPProcessor.from_pretrained( - config.clip_pretrained_model_name_or_path) + self.clip_preprocess = CLIPProcessor.from_pretrained(config.clip_pretrained_model_name_or_path) sites = [ "Artstation", @@ -113,41 +110,45 @@ def load_clip_model(self): return_tensors="pd", padding="max_length", truncation=True, - max_length=self.clip_preprocess.tokenizer.model_max_length, ) - self.artists = LabelTable(artists, "artists", self.clip_model, - self.tokenize, config) + max_length=self.clip_preprocess.tokenizer.model_max_length, + ) + self.artists = LabelTable(artists, "artists", self.clip_model, self.tokenize, config) self.flavors = LabelTable( _load_list(config.data_path, "flavors.txt"), "flavors", self.clip_model, self.tokenize, - config, ) + config, + ) self.mediums = LabelTable( _load_list(config.data_path, "mediums.txt"), "mediums", self.clip_model, self.tokenize, - config, ) + config, + ) self.movements = LabelTable( _load_list(config.data_path, "movements.txt"), "movements", self.clip_model, self.tokenize, - config, ) - self.trendings = LabelTable(trending_list, "trendings", self.clip_model, - self.tokenize, config) + config, + ) + self.trendings = LabelTable(trending_list, "trendings", self.clip_model, self.tokenize, config) self.pad_token_id = self.clip_preprocess.tokenizer.pad_token_id def generate_caption(self, pil_image: Image) -> str: size = self.config.blip_image_eval_size - gpu_image = transforms.Compose([ - transforms.Resize( - (size, size), interpolation="bicubic"), - transforms.ToTensor(), - transforms.Normalize( - self.clip_preprocess.image_processor.image_mean, - self.clip_preprocess.image_processor.image_std, ), - ])(pil_image).unsqueeze(0) + gpu_image = transforms.Compose( + [ + transforms.Resize((size, size), interpolation="bicubic"), + transforms.ToTensor(), + transforms.Normalize( + self.clip_preprocess.image_processor.image_mean, + self.clip_preprocess.image_processor.image_std, + ), + ] + )(pil_image).unsqueeze(0) with paddle.no_grad(): caption = self.blip_model.generate( @@ -157,18 +158,18 @@ def generate_caption(self, pil_image: Image) -> str: max_length=self.config.blip_max_length, min_length=self.config.blip_min_length, top_p=self.config.blip_top_p, - repetition_penalty=self.config.blip_repetition_penalty, ) + repetition_penalty=self.config.blip_repetition_penalty, + ) return caption[0] def image_to_features(self, image: Image) -> paddle.Tensor: images = self.clip_preprocess(images=image, return_tensors="pd") with paddle.no_grad(): - image_features = self.clip_model.get_image_features(images[ - "pixel_values"]) + image_features = self.clip_model.get_image_features(images["pixel_values"]) image_features /= image_features.norm(axis=-1, keepdim=True) return image_features - def interrogate_classic(self, image: Image, max_flavors: int=3) -> str: + def interrogate_classic(self, image: Image, max_flavors: int = 3) -> str: caption = self.generate_caption(image) image_features = self.image_to_features(image) @@ -185,25 +186,21 @@ def interrogate_classic(self, image: Image, max_flavors: int=3) -> str: return 
_truncate_to_fit(prompt, self.tokenize, self.pad_token_id) - def interrogate_fast(self, image: Image, max_flavors: int=32) -> str: + def interrogate_fast(self, image: Image, max_flavors: int = 32) -> str: caption = self.generate_caption(image) image_features = self.image_to_features(image) merged = _merge_tables( - [ - self.artists, self.flavors, self.mediums, self.movements, - self.trendings - ], - self.config, ) + [self.artists, self.flavors, self.mediums, self.movements, self.trendings], + self.config, + ) tops = merged.rank(image_features, max_flavors) - return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize, - self.pad_token_id) + return _truncate_to_fit(caption + ", " + ", ".join(tops), self.tokenize, self.pad_token_id) - def interrogate(self, image: Image, max_flavors: int=32) -> str: + def interrogate(self, image: Image, max_flavors: int = 32) -> str: caption = self.generate_caption(image) image_features = self.image_to_features(image) - flaves = self.flavors.rank(image_features, - self.config.flavor_intermediate_count) + flaves = self.flavors.rank(image_features, self.config.flavor_intermediate_count) best_medium = self.mediums.rank(image_features, 1)[0] best_artist = self.artists.rank(image_features, 1)[0] best_trending = self.trendings.rank(image_features, 1)[0] @@ -225,65 +222,50 @@ def check(addition: str) -> bool: def check_multi_batch(opts: List[str]): nonlocal best_prompt, best_sim prompts = [] - for i in range(2**len(opts)): + for i in range(2 ** len(opts)): prompt = best_prompt for bit in range(len(opts)): if i & (1 << bit): prompt += ", " + opts[bit] prompts.append(prompt) - t = LabelTable(prompts, None, self.clip_model, self.tokenize, - self.config) + t = LabelTable(prompts, None, self.clip_model, self.tokenize, self.config) best_prompt = t.rank(image_features, 1)[0] best_sim = self.similarity(image_features, best_prompt) - check_multi_batch( - [best_medium, best_artist, best_trending, best_movement]) + check_multi_batch([best_medium, best_artist, best_trending, best_movement]) extended_flavors = set(flaves) - for i in tqdm( - range(max_flavors), desc="Flavor chain", - disable=self.config.quiet): - best = self.rank_top( - image_features, - [f"{best_prompt}, {f}" for f in extended_flavors]) - flave = best[len(best_prompt) + 2:] + for i in tqdm(range(max_flavors), desc="Flavor chain", disable=self.config.quiet): + best = self.rank_top(image_features, [f"{best_prompt}, {f}" for f in extended_flavors]) + flave = best[len(best_prompt) + 2 :] if not check(flave): break - if _prompt_at_max_len(best_prompt, self.tokenize, - self.pad_token_id): + if _prompt_at_max_len(best_prompt, self.tokenize, self.pad_token_id): break extended_flavors.remove(flave) return best_prompt - def rank_top(self, image_features: paddle.Tensor, - text_array: List[str]) -> str: + def rank_top(self, image_features: paddle.Tensor, text_array: List[str]) -> str: text_tokens = self.tokenize(text_array) with paddle.no_grad(): - text_features = self.clip_model.get_text_features(text_tokens[ - "input_ids"]) + text_features = self.clip_model.get_text_features(text_tokens["input_ids"]) text_features /= text_features.norm(axis=-1, keepdim=True) - similarity = text_features @image_features.T + similarity = text_features @ image_features.T return text_array[similarity.argmax().item()] def similarity(self, image_features: paddle.Tensor, text: str) -> float: text_tokens = self.tokenize([text]) with paddle.no_grad(): - text_features = self.clip_model.get_text_features(text_tokens[ - "input_ids"]) + 
text_features = self.clip_model.get_text_features(text_tokens["input_ids"]) text_features /= text_features.norm(axis=-1, keepdim=True) - similarity = text_features @image_features.T + similarity = text_features @ image_features.T return similarity[0][0].item() class LabelTable: - def __init__(self, - labels: List[str], - desc: str, - clip_model, - tokenize, - config: Config): + def __init__(self, labels: List[str], desc: str, clip_model, tokenize, config: Config): self.chunk_size = config.chunk_size self.config = config self.embeds = [] @@ -295,10 +277,8 @@ def __init__(self, cache_filepath = None if config.cache_path is not None and desc is not None: os.makedirs(config.cache_path, exist_ok=True) - sanitized_name = config.clip_pretrained_model_name_or_path.replace( - "/", "_").replace("@", "_") - cache_filepath = os.path.join(config.cache_path, - f"{sanitized_name}_{desc}.pkl") + sanitized_name = config.clip_pretrained_model_name_or_path.replace("/", "_").replace("@", "_") + cache_filepath = os.path.join(config.cache_path, f"{sanitized_name}_{desc}.pkl") if desc is not None and os.path.exists(cache_filepath): with open(cache_filepath, "rb") as f: try: @@ -311,16 +291,15 @@ def __init__(self, if len(self.labels) != len(self.embeds): self.embeds = [] - chunks = np.array_split( - self.labels, max(1, len(self.labels) / config.chunk_size)) + chunks = np.array_split(self.labels, max(1, len(self.labels) / config.chunk_size)) for chunk in tqdm( - chunks, - desc=f"Preprocessing {desc}" if desc else None, - disable=self.config.quiet, ): + chunks, + desc=f"Preprocessing {desc}" if desc else None, + disable=self.config.quiet, + ): text_tokens = self.tokenize(chunk.tolist()) with paddle.no_grad(): - text_features = clip_model.get_text_features(text_tokens[ - "input_ids"]) + text_features = clip_model.get_text_features(text_tokens["input_ids"]) text_features /= text_features.norm(axis=-1, keepdim=True) text_features = text_features.cpu().numpy() for i in range(text_features.shape[0]): @@ -335,22 +314,23 @@ def __init__(self, "hash": hash, "model": config.clip_pretrained_model_name_or_path, }, - f, ) + f, + ) def _rank( - self, - image_features: paddle.Tensor, - text_embeds: paddle.Tensor, - top_count: int=1, ) -> str: + self, + image_features: paddle.Tensor, + text_embeds: paddle.Tensor, + top_count: int = 1, + ) -> str: top_count = min(top_count, len(text_embeds)) text_embeds = paddle.to_tensor(text_embeds) - similarity = image_features @text_embeds.T + similarity = image_features @ text_embeds.T _, top_labels = similarity.cast("float32").topk(top_count, axis=-1) top_labels = top_labels.tolist() return [top_labels[0][i] for i in range(top_count)] - def rank(self, image_features: paddle.Tensor, - top_count: int=1) -> List[str]: + def rank(self, image_features: paddle.Tensor, top_count: int = 1) -> List[str]: if len(self.labels) <= self.chunk_size: tops = self._rank(image_features, self.embeds, top_count=top_count) return [self.labels[i] for i in tops] @@ -362,10 +342,7 @@ def rank(self, image_features: paddle.Tensor, for chunk_idx in tqdm(range(num_chunks), disable=self.config.quiet): start = chunk_idx * self.chunk_size stop = min(start + self.chunk_size, len(self.embeds)) - tops = self._rank( - image_features, - self.embeds[start:stop], - top_count=keep_per_chunk) + tops = self._rank(image_features, self.embeds[start:stop], top_count=keep_per_chunk) top_labels.extend([self.labels[start + i] for i in tops]) top_embeds.extend([self.embeds[start + i] for i in tops]) @@ -374,11 +351,7 @@ def rank(self, 
image_features: paddle.Tensor, def _load_list(data_path: str, filename: str) -> List[str]: - with open( - os.path.join(data_path, filename), - "r", - encoding="utf-8", - errors="replace") as f: + with open(os.path.join(data_path, filename), "r", encoding="utf-8", errors="replace") as f: items = [line.strip() for line in f.readlines()] return items @@ -391,7 +364,7 @@ def _merge_tables(tables: List[LabelTable], config: Config) -> LabelTable: return m -def _prompt_at_max_len(text: str, tokenize, pad_token_id: int=0) -> bool: +def _prompt_at_max_len(text: str, tokenize, pad_token_id: int = 0) -> bool: tokens = tokenize([text])["input_ids"] return tokens[0][-1] != pad_token_id diff --git a/ppdiffusers/examples/clip_interrogator/dumpy.py b/ppdiffusers/examples/clip_interrogator/dumpy.py index 9a6e930b2e198..552e84eae5944 100644 --- a/ppdiffusers/examples/clip_interrogator/dumpy.py +++ b/ppdiffusers/examples/clip_interrogator/dumpy.py @@ -14,9 +14,12 @@ # limitations under the License. import gradio as gr -from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) blip_pretrained_model_name_or_path = "Salesforce/blip-image-captioning-base" clip_pretrained_model_name_or_path = "openai/clip-vit-large-patch14" @@ -38,16 +41,18 @@ config = Config( blip_num_beams=64, blip_pretrained_model_name_or_path=blip_pretrained_model_name_or_path, - clip_pretrained_model_name_or_path=clip_pretrained_model_name_or_path, ) + clip_pretrained_model_name_or_path=clip_pretrained_model_name_or_path, +) ci = Interrogator(config) def inference(image, mode, best_max_flavors=32): - ci.config.chunk_size = (2048 if ci.config.clip_pretrained_model_name_or_path - == "openai/clip-vit-large-patch14" else 1024) + ci.config.chunk_size = ( + 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024 + ) ci.config.flavor_intermediate_count = ( - 2048 if ci.config.clip_pretrained_model_name_or_path == - "openai/clip-vit-large-patch14" else 1024) + 2048 if ci.config.clip_pretrained_model_name_or_path == "openai/clip-vit-large-patch14" else 1024 + ) image = image.convert("RGB") if mode == "best": return ci.interrogate(image, max_flavors=int(best_max_flavors)) @@ -59,16 +64,17 @@ def inference(image, mode, best_max_flavors=32): inputs = [ gr.inputs.Image(type="pil"), - gr.Radio( - ["best", "fast", "classic"], label="", value="best"), - gr.Number( - value=16, label="best mode max flavors"), + gr.Radio(["best", "fast", "classic"], label="", value="best"), + gr.Number(value=16, label="best mode max flavors"), +] +outputs = [ + gr.outputs.Textbox(label="Output"), ] -outputs = [gr.outputs.Textbox(label="Output"), ] io = gr.Interface( inference, inputs, outputs, - allow_flagging=False, ) + allow_flagging=False, +) io.launch(debug=False, server_name="0.0.0.0", server_port=8586) diff --git a/ppdiffusers/examples/clip_interrogator/predict.py b/ppdiffusers/examples/clip_interrogator/predict.py index d42d5a666a53c..bb6dd5f6004b7 100644 --- a/ppdiffusers/examples/clip_interrogator/predict.py +++ b/ppdiffusers/examples/clip_interrogator/predict.py @@ -15,9 +15,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
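`LabelTable.rank` above avoids scoring every label at once: the embedding table is split into chunks of `chunk_size`, each chunk is ranked against the image embedding by a plain matrix product (both sides are already L2-normalized, so this is cosine similarity), the best entries of each chunk are pooled, and a final pass re-ranks the survivors. A NumPy sketch of that two-stage top-k; how `keep_per_chunk` is derived is not visible in this hunk, so it is just a parameter here:

import numpy as np

def chunked_rank(image_feat, label_embeds, chunk_size=4, keep_per_chunk=2, top_count=1):
    # image_feat: (d,) unit vector; label_embeds: (n, d), each row a unit vector.
    survivors = []
    for start in range(0, len(label_embeds), chunk_size):
        sims = label_embeds[start:start + chunk_size] @ image_feat      # cosine similarity per label
        survivors.extend(start + np.argsort(-sims)[:keep_per_chunk])    # best labels of this chunk
    survivors = np.asarray(survivors)
    final = label_embeds[survivors] @ image_feat                        # re-rank the pooled survivors
    return survivors[np.argsort(-final)[:top_count]]

rng = np.random.default_rng(0)
embeds = rng.normal(size=(10, 8))
embeds /= np.linalg.norm(embeds, axis=1, keepdims=True)
print(chunked_rank(embeds[7], embeds))  # [7]: a label is most similar to itself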
-from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) from cog import BasePredictor, Input, Path from PIL import Image @@ -28,29 +31,32 @@ def setup(self): Config( blip_pretrained_model_name_or_path="Salesforce/blip-image-captioning-large", clip_pretrained_model_name_or_path="openai/clip-vit-large-patch14", - device="gpu", )) + device="gpu", + ) + ) def predict( - self, - image: Path=Input(description="Input image"), - clip_pretrained_model_name_or_path: str=Input( - default="openai/clip-vit-large-patch14", - choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - description="Choose ViT-L for Stable Diffusion 1, and ViT-H for Stable Diffusion 2", - ), - blip_pretrained_model_name_or_path: str=Input( - default="Salesforce/blip-image-captioning-large", - choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - description="Choose Salesforce/blip-image-captioning-large", ), - mode: str=Input( - default="best", - choices=["best", "classic", "fast"], - description="Prompt mode (best takes 10-20 seconds, fast takes 1-2 seconds).", - ), ) -> str: + self, + image: Path = Input(description="Input image"), + clip_pretrained_model_name_or_path: str = Input( + default="openai/clip-vit-large-patch14", + choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + description="Choose ViT-L for Stable Diffusion 1, and ViT-H for Stable Diffusion 2", + ), + blip_pretrained_model_name_or_path: str = Input( + default="Salesforce/blip-image-captioning-large", + choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + description="Choose Salesforce/blip-image-captioning-large", + ), + mode: str = Input( + default="best", + choices=["best", "classic", "fast"], + description="Prompt mode (best takes 10-20 seconds, fast takes 1-2 seconds).", + ), + ) -> str: """Run a single prediction on the model""" image = Image.open(str(image)).convert("RGB") - self.switch_model(clip_pretrained_model_name_or_path, - blip_pretrained_model_name_or_path) + self.switch_model(clip_pretrained_model_name_or_path, blip_pretrained_model_name_or_path) if mode == "best": return self.ci.interrogate(image) elif mode == "classic": @@ -59,16 +65,13 @@ def predict( return self.ci.interrogate_fast(image) def switch_model( - self, - clip_pretrained_model_name_or_path: str, - blip_pretrained_model_name_or_path: str, ): - if (clip_pretrained_model_name_or_path != - self.ci.config.clip_pretrained_model_name_or_path): - self.ci.config.clip_pretrained_model_name_or_path = ( - clip_pretrained_model_name_or_path) + self, + clip_pretrained_model_name_or_path: str, + blip_pretrained_model_name_or_path: str, + ): + if clip_pretrained_model_name_or_path != self.ci.config.clip_pretrained_model_name_or_path: + self.ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path self.ci.load_clip_model() - if (blip_pretrained_model_name_or_path != - self.ci.config.blip_pretrained_model_name_or_path): - self.ci.config.blip_pretrained_model_name_or_path = ( - blip_pretrained_model_name_or_path) + if blip_pretrained_model_name_or_path != self.ci.config.blip_pretrained_model_name_or_path: + self.ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path self.ci.load_blip_model() diff --git a/ppdiffusers/examples/clip_interrogator/run_cli.py b/ppdiffusers/examples/clip_interrogator/run_cli.py index 081717fcf915d..c905195af03f8 100755 --- 
a/ppdiffusers/examples/clip_interrogator/run_cli.py +++ b/ppdiffusers/examples/clip_interrogator/run_cli.py @@ -21,9 +21,12 @@ import paddle import requests -from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) from PIL import Image @@ -44,18 +47,16 @@ def main(): "--clip", default="openai/clip-vit-large-patch14", choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of CLIP model to use", ) + help="name of CLIP model to use", + ) parser.add_argument( "-b", "--blip", default="Salesforce/blip-image-captioning-large", choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of BLIP model to use", ) - parser.add_argument( - "-d", - "--device", - default="auto", - help="device to use (auto, gpu or cpu)") + help="name of BLIP model to use", + ) + parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") parser.add_argument("-f", "--folder", help="path to folder of images") parser.add_argument("-i", "--image", help="image file or url") parser.add_argument( @@ -63,7 +64,8 @@ def main(): "--mode", default="best", choices=["best", "classic", "fast"], - help="best, classic, or fast", ) + help="best, classic, or fast", + ) args = parser.parse_args() if not args.folder and not args.image: @@ -71,8 +73,7 @@ def main(): exit(1) if args.folder is not None and args.image is not None: - print( - "Specify a folder or batch processing or a single image, not both") + print("Specify a folder or batch processing or a single image, not both") exit(1) # validate clip model name @@ -98,16 +99,15 @@ def main(): # generate a nice prompt config = Config( clip_pretrained_model_name_or_path=args.clip, - blip_pretrained_model_name_or_path=args.blip, ) + blip_pretrained_model_name_or_path=args.blip, + ) ci = Interrogator(config) # process single image if args.image is not None: image_path = args.image - if str(image_path).startswith("http://") or str(image_path).startswith( - "https://"): - image = Image.open(requests.get(image_path, stream=True) - .raw).convert("RGB") + if str(image_path).startswith("http://") or str(image_path).startswith("https://"): + image = Image.open(requests.get(image_path, stream=True).raw).convert("RGB") else: image = Image.open(image_path).convert("RGB") if not image: @@ -121,10 +121,7 @@ def main(): print(f"The folder {args.folder} does not exist!") exit(1) - files = [ - f for f in os.listdir(args.folder) - if f.endswith(".jpg") or f.endswith(".png") - ] + files = [f for f in os.listdir(args.folder) if f.endswith(".jpg") or f.endswith(".png")] prompts = [] for file in files: image = Image.open(os.path.join(args.folder, file)).convert("RGB") @@ -140,9 +137,7 @@ def main(): for file, prompt in zip(files, prompts): w.writerow([file, prompt]) - print( - f"\n\n\n\nGenerated {len(prompts)} and saved to {csv_path}, enjoy!" 
- ) + print(f"\n\n\n\nGenerated {len(prompts)} and saved to {csv_path}, enjoy!") if __name__ == "__main__": diff --git a/ppdiffusers/examples/clip_interrogator/run_gradio.py b/ppdiffusers/examples/clip_interrogator/run_gradio.py index 435c7c46a265b..60c35b66fe030 100755 --- a/ppdiffusers/examples/clip_interrogator/run_gradio.py +++ b/ppdiffusers/examples/clip_interrogator/run_gradio.py @@ -19,9 +19,12 @@ import gradio as gr import paddle -from clip_interrogator import (BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, Config, - Interrogator) +from clip_interrogator import ( + BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, + Config, + Interrogator, +) parser = argparse.ArgumentParser() parser.add_argument( @@ -29,19 +32,18 @@ "--clip", default="openai/clip-vit-large-patch14", choices=CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of CLIP model to use", ) + help="name of CLIP model to use", +) parser.add_argument( "-b", "--blip", default="Salesforce/blip-image-captioning-large", choices=BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, - help="name of BLIP model to use", ) -parser.add_argument( - "-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") -parser.add_argument( - "-s", "--share", action="store_true", help="Create a public link") -parser.add_argument( - "--server_name", default="0.0.0.0", type=str, help="server_name") + help="name of BLIP model to use", +) +parser.add_argument("-d", "--device", default="auto", help="device to use (auto, gpu or cpu)") +parser.add_argument("-s", "--share", action="store_true", help="Create a public link") +parser.add_argument("--server_name", default="0.0.0.0", type=str, help="server_name") parser.add_argument("--server_port", default=8586, type=int, help="server_port") args = parser.parse_args() @@ -69,31 +71,29 @@ config = Config( cache_path="cache", clip_pretrained_model_name_or_path=args.clip, - blip_pretrained_model_name_or_path=args.blip, ) + blip_pretrained_model_name_or_path=args.blip, +) ci = Interrogator(config) def inference( - image, - mode, - clip_pretrained_model_name_or_path, - blip_pretrained_model_name_or_path, - blip_min_length, - blip_max_length, - blip_sample, - blip_top_p, - blip_repetition_penalty, - blip_num_beams, ): - if (clip_pretrained_model_name_or_path != - ci.config.clip_pretrained_model_name_or_path): - ci.config.clip_pretrained_model_name_or_path = ( - clip_pretrained_model_name_or_path) + image, + mode, + clip_pretrained_model_name_or_path, + blip_pretrained_model_name_or_path, + blip_min_length, + blip_max_length, + blip_sample, + blip_top_p, + blip_repetition_penalty, + blip_num_beams, +): + if clip_pretrained_model_name_or_path != ci.config.clip_pretrained_model_name_or_path: + ci.config.clip_pretrained_model_name_or_path = clip_pretrained_model_name_or_path ci.load_clip_model() - if (blip_pretrained_model_name_or_path != - ci.config.blip_pretrained_model_name_or_path): - ci.config.blip_pretrained_model_name_or_path = ( - blip_pretrained_model_name_or_path) + if blip_pretrained_model_name_or_path != ci.config.blip_pretrained_model_name_or_path: + ci.config.blip_pretrained_model_name_or_path = blip_pretrained_model_name_or_path ci.load_blip_model() ci.config.blip_min_length = int(blip_min_length) @@ -114,36 +114,25 @@ def inference( inputs = [ gr.inputs.Image(type="pil"), - gr.Radio( - ["best", "classic", "fast"], label="Mode", value="fast"), - gr.Dropdown( - CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.clip, - label="CLIP Model"), - gr.Dropdown( - 
BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.blip, - label="BLIP Model"), - gr.Number( - value=8, label="Caption min Length"), - gr.Number( - value=32, label="Caption Max Length"), - gr.Radio( - ["True", "False"], value="False", label="Sample or not?"), - gr.Number( - value=0.9, label="TopP value, when Sample is true"), - gr.Number( - value=1.1, label="Repetition penalty value, when Sample is false"), - gr.Number( - value=64, label="Caption Num Beams, when Sample is false"), + gr.Radio(["best", "classic", "fast"], label="Mode", value="fast"), + gr.Dropdown(CLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.clip, label="CLIP Model"), + gr.Dropdown(BLIP_PRETRAINED_MODEL_ARCHIVE_LIST, value=args.blip, label="BLIP Model"), + gr.Number(value=8, label="Caption min Length"), + gr.Number(value=32, label="Caption Max Length"), + gr.Radio(["True", "False"], value="False", label="Sample or not?"), + gr.Number(value=0.9, label="TopP value, when Sample is true"), + gr.Number(value=1.1, label="Repetition penalty value, when Sample is false"), + gr.Number(value=64, label="Caption Num Beams, when Sample is false"), +] +outputs = [ + gr.outputs.Textbox(label="Image Caption Output"), ] -outputs = [gr.outputs.Textbox(label="Image Caption Output"), ] io = gr.Interface( inference, inputs, outputs, title="🕵️‍♂️ Paddle CLIP Interrogator 🕵️‍♂️", - allow_flagging=False, ) -io.launch( - share=args.share, - server_name=args.server_name, - server_port=args.server_port) + allow_flagging=False, +) +io.launch(share=args.share, server_name=args.server_name, server_port=args.server_port) diff --git a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py index 3ef59efaf907f..f4495bba5b6f4 100644 --- a/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/ppdiffusers/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -20,18 +20,30 @@ import paddle.nn.functional as F import PIL from einops import rearrange -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPModel, - CLIPTextModel, CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPModel, + CLIPTextModel, + CLIPTokenizer, +) from tqdm import tqdm -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.loaders import FromCkptMixin -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import ( + StableDiffusionPipelineOutput, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.utils import PIL_INTERPOLATION, logging, randn_tensor logger = logging.get_logger(__name__) @@ -43,11 +55,7 @@ def preprocess(image, w, h): elif isinstance(image, PIL.Image.Image): image = [image] if isinstance(image[0], PIL.Image.Image): - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[(None), :] - for i in image - ] + image = [np.array(i.resize((w, h), 
resample=PIL_INTERPOLATION["lanczos"]))[(None), :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -82,11 +90,12 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): def spherical_dist_loss(x, y): x = F.normalize(x=x, axis=-1) y = F.normalize(x=y, axis=-1) - return (paddle.divide( - (x - y).norm(axis=-1), paddle.to_tensor( - 2, dtype=x.dtype)).asin().pow(y=paddle.to_tensor( - 2, dtype=x.dtype)).multiply(y=paddle.to_tensor( - 2, dtype=x.dtype))) + return ( + paddle.divide((x - y).norm(axis=-1), paddle.to_tensor(2, dtype=x.dtype)) + .asin() + .pow(y=paddle.to_tensor(2, dtype=x.dtype)) + .multiply(y=paddle.to_tensor(2, dtype=x.dtype)) + ) def set_requires_grad(model, value): @@ -97,20 +106,25 @@ def set_requires_grad(model, value): class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, FromCkptMixin): # _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - clip_model: CLIPModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, - DPMSolverMultistepScheduler, ], - feature_extractor: CLIPFeatureExtractor, - safety_checker: StableDiffusionSafetyChecker, - blip_model=None, - blip_processor=None, - clip_interrogator=None, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + clip_model: CLIPModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[ + PNDMScheduler, + LMSDiscreteScheduler, + DDIMScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: CLIPFeatureExtractor, + safety_checker: StableDiffusionSafetyChecker, + blip_model=None, + blip_processor=None, + clip_interrogator=None, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -139,18 +153,21 @@ def __init__( blip_model=blip_model, blip_processor=blip_processor, clip_interrogator=clip_interrogator, - safety_checker=safety_checker, ) + safety_checker=safety_checker, + ) self.feature_extractor_size = ( - feature_extractor.size if isinstance(feature_extractor.size, int) - else feature_extractor.size["shortest_edge"]) + feature_extractor.size + if isinstance(feature_extractor.size, int) + else feature_extractor.size["shortest_edge"] + ) self.normalize = paddle.vision.transforms.Normalize( - mean=feature_extractor.image_mean, std=feature_extractor.image_std) + mean=feature_extractor.image_mean, std=feature_extractor.image_std + ) set_requires_grad(self.text_encoder, False) set_requires_grad(self.clip_model, False) self.register_to_config(requires_safety_checker=requires_safety_checker) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): if slice_size == "auto": slice_size = self.unet.config.attention_head_dim // 2 self.unet.set_attention_slice(slice_size) @@ -171,46 +188,35 @@ def unfreeze_unet(self): set_requires_grad(self.unet, True) def get_timesteps(self, num_inference_steps, strength): - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] return timesteps, num_inference_steps - t_start 
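`spherical_dist_loss` above is the squared spherical distance commonly used for CLIP guidance: for unit vectors separated by angle theta, ||x - y|| = 2*sin(theta/2), so 2*arcsin(||x - y|| / 2)**2 equals theta**2 / 2. A quick NumPy check of that identity (values are illustrative):

import numpy as np

def spherical_dist(x, y):
    # Mirrors the paddle expression above: 2 * arcsin(||x - y|| / 2) ** 2
    return 2.0 * np.arcsin(np.linalg.norm(x - y) / 2.0) ** 2

theta = 0.3                                    # angle between two unit vectors
x = np.array([1.0, 0.0])
y = np.array([np.cos(theta), np.sin(theta)])
print(spherical_dist(x, y))                    # ~0.045
print(theta ** 2 / 2)                          # ~0.045, i.e. theta^2 / 2 as expected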
def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept - def prepare_latents(self, - image, - timestep, - batch_size, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, dtype, generator=None): if not isinstance(image, paddle.Tensor): - raise ValueError( - f"`image` has to be of type `torch.Tensor` but is {type(image)}") + raise ValueError(f"`image` has to be of type `torch.Tensor` but is {type(image)}") image = image.cast(dtype) if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(x=init_latents, axis=0) else: init_latents = self.vae.encode(image).latent_dist.sample(generator) init_latents = 0.18215 * init_latents - init_latents = init_latents.repeat_interleave( - repeats=batch_size, axis=0) - noise = randn_tensor( - init_latents.shape, generator=generator, dtype=dtype) + init_latents = init_latents.repeat_interleave(repeats=batch_size, axis=0) + noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype) # get latents init_latents = self.scheduler.add_noise(init_latents, noise, timestep) @@ -223,63 +229,53 @@ def get_image_description(self, image): else: # with paddle.no_grad(), paddle.amp.auto_cast(): inputs = self.blip_processor(images=image, return_tensors="pd") - inputs["pixel_values"] = inputs["pixel_values"].cast( - self.blip_model.dtype) + inputs["pixel_values"] = inputs["pixel_values"].cast(self.blip_model.dtype) # out = self.blip_model.generate(**inputs, decode_strategy="beam_search", num_beams=2, length_penalty=0, max_length=5) out = self.blip_model.generate(**inputs) - return self.blip_processor.decode( - out[0][0], skip_special_tokens=True) + return self.blip_processor.decode(out[0][0], skip_special_tokens=True) def get_clip_image_embeddings(self, image, batch_size): clip_image_input = self.feature_extractor.preprocess(image) clip_image_features = ( - paddle.to_tensor(data=clip_image_input["pixel_values"][0]) - .unsqueeze(axis=0).astype(dtype="float16")) - image_embeddings_clip = self.clip_model.get_image_features( - clip_image_features) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) - image_embeddings_clip = image_embeddings_clip.repeat_interleave( - repeats=batch_size, axis=0) + paddle.to_tensor(data=clip_image_input["pixel_values"][0]).unsqueeze(axis=0).astype(dtype="float16") + ) + image_embeddings_clip = self.clip_model.get_image_features(clip_image_features) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) + image_embeddings_clip = image_embeddings_clip.repeat_interleave(repeats=batch_size, axis=0) return image_embeddings_clip @paddle.enable_grad() def cond_fn( - self, - latents, - timestep, - index, - text_embeddings, - noise_pred_original, - original_image_embeddings_clip, - clip_guidance_scale, ): + self, + latents, + 
timestep, + index, + text_embeddings, + noise_pred_original, + original_image_embeddings_clip, + clip_guidance_scale, + ): out_0 = latents.detach() out_0.stop_gradient = not True latents = out_0 latent_model_input = self.scheduler.scale_model_input(latents, timestep) # predict the noise residual - noise_pred = self.unet( - latent_model_input, timestep, - encoder_hidden_states=text_embeddings).sample - if isinstance( - self.scheduler, - (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)): + noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample + if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)): alpha_prod_t = self.scheduler.alphas_cumprod[timestep] beta_prod_t = 1 - alpha_prod_t # compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = ( - latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5 + pred_original_sample = (latents - beta_prod_t**0.5 * noise_pred) / alpha_prod_t**0.5 fac = paddle.sqrt(x=beta_prod_t) sample = pred_original_sample * fac + latents * (1 - fac) elif isinstance(self.scheduler, LMSDiscreteScheduler): sigma = self.scheduler.sigmas[index] sample = latents - sigma * noise_pred else: - raise ValueError( - f"scheduler type {type(self.scheduler)} not supported") + raise ValueError(f"scheduler type {type(self.scheduler)} not supported") # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor sample = 1 / 0.18215 * sample @@ -289,56 +285,48 @@ def cond_fn( # image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image) c_size = image.shape[0] image = rearrange(image, "c t h w -> (c t) h w") - image = paddle.vision.transforms.Resize(self.feature_extractor_size)( - image) + image = paddle.vision.transforms.Resize(self.feature_extractor_size)(image) image = rearrange(image, "(c t) h w -> c t h w", c=c_size) image = self.normalize(image) image_embeddings_clip = self.clip_model.get_image_features(image) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) - loss = (spherical_dist_loss(image_embeddings_clip, - original_image_embeddings_clip).mean() * - clip_guidance_scale) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) + loss = spherical_dist_loss(image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale grads = -paddle.autograd.grad(loss, latents)[0] if isinstance(self.scheduler, LMSDiscreteScheduler): latents = latents.detach() + grads * sigma**2 noise_pred = noise_pred_original else: - noise_pred = noise_pred_original - paddle.sqrt( - x=beta_prod_t) * grads + noise_pred = noise_pred_original - paddle.sqrt(x=beta_prod_t) * grads return noise_pred, latents @paddle.no_grad() def __call__( - self, - style_image: Union[paddle.Tensor, PIL.Image.Image], - content_image: Union[paddle.Tensor, PIL.Image.Image], - style_prompt: Optional[str]=None, - content_prompt: Optional[str]=None, - negative_prompt=None, - height: Optional[int]=512, - width: Optional[int]=512, - noise_strength: float=0.6, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - batch_size: Optional[int]=1, - eta: float=0.0, - clip_guidance_scale: Optional[float]=100, - generator: Optional[paddle.Generator]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - 
slerp_latent_style_strength: float=0.8, - slerp_prompt_style_strength: float=0.1, - slerp_clip_image_style_strength: float=0.1, ): + self, + style_image: Union[paddle.Tensor, PIL.Image.Image], + content_image: Union[paddle.Tensor, PIL.Image.Image], + style_prompt: Optional[str] = None, + content_prompt: Optional[str] = None, + negative_prompt=None, + height: Optional[int] = 512, + width: Optional[int] = 512, + noise_strength: float = 0.6, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + batch_size: Optional[int] = 1, + eta: float = 0.0, + clip_guidance_scale: Optional[float] = 100, + generator: Optional[paddle.Generator] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + slerp_latent_style_strength: float = 0.8, + slerp_prompt_style_strength: float = 0.1, + slerp_clip_image_style_strength: float = 0.1, + ): if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed {batch_size} batch_size, but only {len(generator)} generators." - ) + raise ValueError(f"You have passed {batch_size} batch_size, but only {len(generator)} generators.") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # generate prompts with blip model if prompt is if content_prompt is None: @@ -353,35 +341,32 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) - content_text_embeddings = self.text_encoder( - content_text_input.input_ids)[0] + return_tensors="pd", + ) + content_text_embeddings = self.text_encoder(content_text_input.input_ids)[0] style_text_input = self.tokenizer( style_prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) style_text_embeddings = self.text_encoder(style_text_input.input_ids)[0] - text_embeddings = slerp(slerp_prompt_style_strength, - content_text_embeddings, style_text_embeddings) + text_embeddings = slerp(slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings) # duplicate text embeddings for each generation per prompt - text_embeddings = text_embeddings.repeat_interleave( - repeats=batch_size, axis=0) + text_embeddings = text_embeddings.repeat_interleave(repeats=batch_size, axis=0) # set timesteps - accepts_offset = "offset" in set( - inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) # Some schedulers like PNDM have timesteps as arrays # It's more optimized to move all timesteps to correct device beforehand - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - noise_strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, noise_strength) latent_timestep = timesteps[:1].tile(repeat_times=[batch_size]) # Preprocess image @@ -391,25 +376,25 @@ def __call__( latent_timestep, batch_size, text_embeddings.dtype, - generator, ) + generator, + ) preprocessed_style_image = preprocess(style_image, width, height) style_latents = self.prepare_latents( preprocessed_style_image, latent_timestep, batch_size, 
text_embeddings.dtype, - generator, ) - latents = slerp(slerp_latent_style_strength, content_latents, - style_latents) + generator, + ) + latents = slerp(slerp_latent_style_strength, content_latents, style_latents) if clip_guidance_scale > 0: - content_clip_image_embedding = self.get_clip_image_embeddings( - content_image, batch_size) - style_clip_image_embedding = self.get_clip_image_embeddings( - style_image, batch_size) + content_clip_image_embedding = self.get_clip_image_embeddings(content_image, batch_size) + style_clip_image_embedding = self.get_clip_image_embeddings(style_image, batch_size) clip_image_embeddings = slerp( slerp_clip_image_style_strength, content_clip_image_embedding, - style_clip_image_embedding, ) + style_clip_image_embedding, + ) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -426,17 +411,16 @@ def __call__( uncond_tokens, padding="max_length", max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] # duplicate unconditional embeddings for each generation per prompt - uncond_embeddings = uncond_embeddings.repeat_interleave( - repeats=batch_size, axis=0) + uncond_embeddings = uncond_embeddings.repeat_interleave(repeats=batch_size, axis=0) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - x=[uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat(x=[uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -451,13 +435,10 @@ def __call__( ] latents_dtype = text_embeddings.dtype if latents is None: - latents = paddle.randn( - shape=latents_shape, generator=generator, dtype=latents_dtype) + latents = paddle.randn(shape=latents_shape, generator=generator, dtype=latents_dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -466,41 +447,34 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
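# Minimal illustrative sketch of the introspection pattern used just below, which probes
# scheduler.step() for optional keyword arguments before forwarding them. _DemoScheduler is a
# hypothetical stand-in; only its signature matters here.
import inspect

class _DemoScheduler:
    def step(self, noise_pred, t, latents, eta=0.0, generator=None):
        return latents

step_params = set(inspect.signature(_DemoScheduler.step).parameters.keys())
extra_step_kwargs = {}
if "eta" in step_params:        # DDIM-style schedulers accept eta
    extra_step_kwargs["eta"] = 0.0
if "generator" in step_params:  # some schedulers also accept a generator
    extra_step_kwargs["generator"] = None
# Schedulers whose step() lacks these parameters simply receive an empty kwargs dict.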
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator # with self.progress_bar(total=num_inference_steps): for i, t in tqdm(enumerate(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform classifier free guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # perform clip guidance if clip_guidance_scale > 0: text_embeddings_for_guidance = ( - text_embeddings.chunk(chunks=2)[1] - if do_classifier_free_guidance else text_embeddings) + text_embeddings.chunk(chunks=2)[1] if do_classifier_free_guidance else text_embeddings + ) noise_pred, latents = self.cond_fn( latents, t, @@ -508,23 +482,21 @@ def __call__( text_embeddings_for_guidance, noise_pred, clip_image_embeddings, - clip_guidance_scale, ) + clip_guidance_scale, + ) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: return image, None - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/clip_guided_stable_diffusion.py index ee8e0cac04537..f23f5d60b2eee 100644 --- a/ppdiffusers/examples/community/clip_guided_stable_diffusion.py +++ b/ppdiffusers/examples/community/clip_guided_stable_diffusion.py @@ -20,14 +20,22 @@ from paddle import nn from paddle.nn import functional as F from paddle.vision import 
transforms -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPModel, - CLIPTextModel, CLIPTokenizer) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline, - LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPModel, + CLIPTextModel, + CLIPTokenizer, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DiffusionPipeline, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from ppdiffusers.utils import logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -46,13 +54,10 @@ def forward(self, pixel_values, num_cutouts): min_size = min(sideX, sideY, self.cut_size) cutouts = [] for _ in range(num_cutouts): - size = int( - paddle.rand((1, ))**self.cut_power * (max_size - min_size) + - min_size) - offsetx = int(paddle.randint(0, sideX - size + 1, (1, ))) - offsety = int(paddle.randint(0, sideY - size + 1, (1, ))) - cutout = pixel_values[:, :, offsety:offsety + size, offsetx:offsetx - + size] + size = int(paddle.rand((1,)) ** self.cut_power * (max_size - min_size) + min_size) + offsetx = int(paddle.randint(0, sideX - size + 1, (1,))) + offsety = int(paddle.randint(0, sideY - size + 1, (1,))) + cutout = pixel_values[:, :, offsety : offsety + size, offsetx : offsetx + size] cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size)) return paddle.concat(cutouts) @@ -75,15 +80,15 @@ class CLIPGuidedStableDiffusion(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - clip_model: CLIPModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, - DDIMScheduler], - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + clip_model: CLIPModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler], + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() self.register_modules( vae=vae, @@ -92,20 +97,21 @@ def __init__( tokenizer=tokenizer, unet=unet, scheduler=scheduler, - feature_extractor=feature_extractor, ) - - self.normalize = transforms.Normalize( - mean=feature_extractor.image_mean, std=feature_extractor.image_std) - self.cut_out_size = (feature_extractor.size - if isinstance(feature_extractor.size, int) else - feature_extractor.size["shortest_edge"]) + feature_extractor=feature_extractor, + ) + + self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) + self.cut_out_size = ( + feature_extractor.size + if isinstance(feature_extractor.size, int) + else feature_extractor.size["shortest_edge"] + ) self.make_cutouts = MakeCutouts(self.cut_out_size) set_stop_gradient(self.text_encoder, True) set_stop_gradient(self.clip_model, True) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): if slice_size == "auto": # half the attention head size is usually a good trade-off between # speed and memory @@ -128,16 +134,17 @@ def unfreeze_unet(self): set_stop_gradient(self.unet, False) def cond_fn( - self, - latents, - timestep, - index, - text_embeddings, - noise_pred_original, - 
text_embeddings_clip, - clip_guidance_scale, - num_cutouts, - use_cutouts=True, ): + self, + latents, + timestep, + index, + text_embeddings, + noise_pred_original, + text_embeddings_clip, + clip_guidance_scale, + num_cutouts, + use_cutouts=True, + ): # https://github.com/PaddlePaddle/Paddle/issues/54306 in 2.5rc paddle.set_grad_enabled has bug with paddle.set_grad_enabled(True): latents = latents.detach() @@ -146,24 +153,19 @@ def cond_fn( if isinstance(self.scheduler, LMSDiscreteScheduler): sigma = self.scheduler.sigmas[index] # the model input needs to be scaled to match the continuous ODE formulation in K-LMS - latent_model_input = latents / ((sigma**2 + 1)**0.5) + latent_model_input = latents / ((sigma**2 + 1) ** 0.5) else: latent_model_input = latents # predict the noise residual - noise_pred = self.unet( - latent_model_input, - timestep, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler)): alpha_prod_t = self.scheduler.alphas_cumprod[timestep] beta_prod_t = 1 - alpha_prod_t # compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = ( - latents - beta_prod_t** - (0.5) * noise_pred) / alpha_prod_t**(0.5) + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) fac = paddle.sqrt(beta_prod_t) sample = pred_original_sample * (fac) + latents * (1 - fac) @@ -171,8 +173,7 @@ def cond_fn( sigma = self.scheduler.sigmas[index] sample = latents - sigma * noise_pred else: - raise ValueError( - f"scheduler type {type(self.scheduler)} not supported") + raise ValueError(f"scheduler type {type(self.scheduler)} not supported") sample = 1 / 0.18215 * sample image = self.vae.decode(sample).sample @@ -182,23 +183,18 @@ def cond_fn( image = self.make_cutouts(image, num_cutouts) else: resize_transform = transforms.Resize(self.cut_out_size) - image = paddle.stack( - [resize_transform(img) for img in image], axis=0) + image = paddle.stack([resize_transform(img) for img in image], axis=0) image = self.normalize(image).astype(latents.dtype) image_embeddings_clip = self.clip_model.get_image_features(image) - image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, axis=-1, keepdim=True) if use_cutouts: - dists = spherical_dist_loss(image_embeddings_clip, - text_embeddings_clip) + dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip) dists = dists.reshape([num_cutouts, sample.shape[0], -1]) loss = dists.sum(2).mean(0).sum() * clip_guidance_scale else: - loss = (spherical_dist_loss(image_embeddings_clip, - text_embeddings_clip).mean() * - clip_guidance_scale) + loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale grads = -paddle.autograd.grad(loss, latents)[0] @@ -206,52 +202,49 @@ def cond_fn( latents = latents.detach() + grads * (sigma**2) noise_pred = noise_pred_original else: - noise_pred = noise_pred_original - paddle.sqrt( - beta_prod_t) * grads + noise_pred = noise_pred_original - paddle.sqrt(beta_prod_t) * grads return noise_pred, latents @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - 
guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - clip_guidance_scale: Optional[float]=100, - clip_prompt: Optional[Union[str, List[str]]]=None, - num_cutouts: Optional[int]=4, - use_cutouts: Optional[bool]=True, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + clip_guidance_scale: Optional[float] = 100, + clip_prompt: Optional[Union[str, List[str]]] = None, + num_cutouts: Optional[int] = 4, + use_cutouts: Optional[bool] = True, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): if isinstance(prompt, str): batch_size = 1 elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # get prompt text embeddings text_inputs = self.tokenizer( @@ -259,26 +252,25 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] # duplicate text embeddings for each generation per prompt bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if clip_guidance_scale > 0: if clip_prompt is not None: @@ -287,19 +279,16 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids else: clip_text_input_ids = text_inputs.input_ids - text_embeddings_clip = self.clip_model.get_text_features( - clip_text_input_ids) - text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm( - p=2, axis=-1, keepdim=True) + text_embeddings_clip = self.clip_model.get_text_features(clip_text_input_ids) + text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, axis=-1, keepdim=True) # duplicate text embeddings clip for each generation per prompt bs_embed, _ = text_embeddings_clip.shape - text_embeddings_clip = text_embeddings_clip.tile( - [1, num_images_per_prompt]) - text_embeddings_clip = text_embeddings_clip.reshape( - [bs_embed * num_images_per_prompt, -1]) + text_embeddings_clip = text_embeddings_clip.tile([1, num_images_per_prompt]) + text_embeddings_clip = text_embeddings_clip.reshape([bs_embed * num_images_per_prompt, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -313,14 +302,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -330,23 +321,20 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -360,13 +348,10 @@ def __call__( width // 8, ] if latents is None: - latents = paddle.randn( - latents_shape, generator=generator, dtype=text_embeddings.dtype) + latents = paddle.randn(latents_shape, generator=generator, dtype=text_embeddings.dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -382,41 +367,34 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
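# Minimal illustrative sketch of the classifier-free guidance combine applied in the denoising
# loop that follows: the unconditional and text-conditional noise predictions come out of a single
# doubled batch and are mixed with guidance_scale. Shapes below are hypothetical.
import paddle

guidance_scale = 7.5
noise_pred = paddle.randn([2, 4, 64, 64])  # [uncond, cond] stacked along the batch axis
noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# guidance_scale == 1 recovers the plain conditional prediction; larger values push the sample
# harder toward the text prompt.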
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform classifier free guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # perform clip guidance if clip_guidance_scale > 0: - text_embeddings_for_guidance = (text_embeddings.chunk(2)[1] - if do_classifier_free_guidance - else text_embeddings) + text_embeddings_for_guidance = ( + text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings + ) noise_pred, latents = self.cond_fn( latents, t, @@ -426,11 +404,11 @@ def __call__( text_embeddings_clip, clip_guidance_scale, num_cutouts, - use_cutouts, ) + use_cutouts, + ) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -449,5 +427,4 @@ def __call__( if not return_dict: return (image, None) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=None) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None) diff --git a/ppdiffusers/examples/community/composable_stable_diffusion.py b/ppdiffusers/examples/community/composable_stable_diffusion.py index f3ff012a945f0..74e7f3856fdb6 100644 --- a/ppdiffusers/examples/community/composable_stable_diffusion.py +++ b/ppdiffusers/examples/community/composable_stable_diffusion.py @@ -16,18 +16,16 @@ from typing import Callable, Optional, Union import paddle -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from 
ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -62,30 +60,26 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -107,10 +101,10 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. 
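# Minimal illustrative sketch of how enable_attention_slicing("auto") is resolved in the pipelines
# in this diff: "auto" becomes half the UNet's attention head dimension before set_attention_slice
# is called. The dict stands in for unet.config and its value is hypothetical.
unet_config = {"attention_head_dim": 8}

def resolve_slice_size(slice_size="auto"):
    if slice_size == "auto":
        # half the attention head size is usually a good trade-off between speed and memory
        return unet_config["attention_head_dim"] // 2
    return slice_size

assert resolve_slice_size("auto") == 4
assert resolve_slice_size(2) == 2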
@@ -139,24 +133,25 @@ def disable_attention_slicing(self): @paddle.no_grad() def __call__( - self, - prompt: str, - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: str=None, - # num_images_per_prompt: Optional[int] = 1, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - weights: Optional[str]="", - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - reduce_memory: Optional[bool]=True, - **kwargs, ): + self, + prompt: str, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: str = None, + # num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + weights: Optional[str] = "", + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + reduce_memory: Optional[bool] = True, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -215,24 +210,20 @@ def __call__( if isinstance(prompt, str): batch_size = 1 else: - raise ValueError( - f"`prompt` has to be of type `str`but is {type(prompt)}") + raise ValueError(f"`prompt` has to be of type `str`but is {type(prompt)}") if negative_prompt is not None and not isinstance(negative_prompt, str): - raise ValueError( - f"`negative_prompt` has to be of type `str`but is {type(prompt)}" - ) + raise ValueError(f"`negative_prompt` has to be of type `str`but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if "|" in prompt: prompt = [x.strip() for x in prompt.split("|")] @@ -244,19 +235,19 @@ def __call__( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] # duplicate text embeddings for each generation per prompt, using mps friendly method # bs_embed, seq_len, _ = text_embeddings.shape @@ -268,20 +259,17 @@ def __call__( # specify weights for prompts (excluding the unconditional score) print("using equal weights for all prompts...") pos_weights = paddle.to_tensor( - [1 / (text_embeddings.shape[0] - 1)] * - (text_embeddings.shape[0] - 1)).reshape([-1, 1, 1, 1]) + [1 / (text_embeddings.shape[0] - 1)] * (text_embeddings.shape[0] - 1) + ).reshape([-1, 1, 1, 1]) neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1]) - mask = paddle.to_tensor( - [False] + [True] * pos_weights.shape[0], dtype=paddle.bool) + mask = paddle.to_tensor([False] + [True] * pos_weights.shape[0], dtype=paddle.bool) else: # set prompt weight for each num_prompts = len(prompt) if isinstance(prompt, list) else 1 weights = [float(w.strip()) for w in weights.split("|")] if len(weights) < num_prompts: weights.append(1.0) - assert ( - len(weights) == text_embeddings.shape[0] - ), "weights specified are not equal to the number of prompts" + assert len(weights) == text_embeddings.shape[0], "weights specified are not equal to the number of prompts" pos_weights = [] neg_weights = [] mask = [] # first one is unconditional score @@ -296,8 +284,7 @@ def __call__( pos_weights = paddle.to_tensor(pos_weights).reshape([-1, 1, 1, 1]) pos_weights = pos_weights / pos_weights.sum() if neg_weights: - neg_weights = paddle.to_tensor(neg_weights).reshape( - [-1, 1, 1, 1]) + neg_weights = paddle.to_tensor(neg_weights).reshape([-1, 1, 1, 1]) neg_weights = neg_weights / neg_weights.sum() mask = paddle.to_tensor(mask, dtype=paddle.bool) @@ -320,10 +307,10 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method # seq_len = uncond_embeddings.shape[1] @@ -335,31 +322,25 @@ def __call__( # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # update negative weights neg_weights = paddle.to_tensor([1.0]).reshape([-1, 1, 1, 1]) - mask = paddle.to_tensor( - [False] + mask.tolist(), dtype=paddle.bool) + mask = paddle.to_tensor([False] + mask.tolist(), dtype=paddle.bool) # get the initial random noise unless the user supplied it # Unlike in other pipelines, latents need to be generated in the target device # for 1-to-1 results reproducibility with the CompVis implementation. # However this currently doesn't work in `mps`. - latents_shape = [ - batch_size, self.unet.in_channels, height // 8, width // 8 - ] + latents_shape = [batch_size, self.unet.in_channels, height // 8, width // 8] if latents is None: if seed is not None: paddle.seed(seed) latents = paddle.randn(latents_shape, dtype=text_embeddings.dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -375,8 +356,7 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta @@ -384,47 +364,34 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance latent_model_input = ( - paddle.concat([latents] * text_embeddings.shape[0]) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + paddle.concat([latents] * text_embeddings.shape[0]) if do_classifier_free_guidance else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if reduce_memory: # reduce memory by predicting each score sequentially noise_preds = [] # predict the noise residual for latent_in, text_embedding_in in zip( - latent_model_input.chunk( - latent_model_input.shape[0], axis=0), - text_embeddings.chunk( - text_embeddings.shape[0], axis=0), ): - noise_preds.append( - self.unet( - latent_in, - t, - encoder_hidden_states=text_embedding_in).sample) + latent_model_input.chunk(latent_model_input.shape[0], axis=0), + text_embeddings.chunk(text_embeddings.shape[0], axis=0), + ): + noise_preds.append(self.unet(latent_in, t, encoder_hidden_states=text_embedding_in).sample) noise_preds = paddle.concat(noise_preds, axis=0) else: # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: mask_index = paddle.nonzero(mask).reshape([-1]) non_mask_index = paddle.nonzero(~mask).reshape([-1]) - noise_pred_uncond = (noise_preds[non_mask_index] * - neg_weights).sum(axis=0, keepdim=True) - noise_pred_text = (noise_preds[mask_index] * 
pos_weights).sum( - axis=0, keepdim=True) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond = (noise_preds[non_mask_index] * neg_weights).sum(axis=0, keepdim=True) + noise_pred_text = (noise_preds[mask_index] * pos_weights).sum(axis=0, keepdim=True) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -441,12 +408,11 @@ def __call__( # run safety checker if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.astype( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -456,5 +422,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py b/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py index 1a244474fba03..87cee7e93a914 100644 --- a/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py +++ b/ppdiffusers/examples/community/inference_clip_guided_stable_diffusion.py @@ -33,15 +33,14 @@ def image_grid(imgs, rows, cols): def create_clip_guided_pipeline( - model_id="CompVis/stable-diffusion-v1-4", - clip_model_id="openai/clip-vit-large-patch14", - scheduler="plms", ): - pipeline = StableDiffusionPipeline.from_pretrained( - model_id, paddle_dtype=paddle.float16) + model_id="CompVis/stable-diffusion-v1-4", + clip_model_id="openai/clip-vit-large-patch14", + scheduler="plms", +): + pipeline = StableDiffusionPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) if scheduler == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") else: scheduler = pipeline.scheduler @@ -55,26 +54,28 @@ def create_clip_guided_pipeline( text_encoder=pipeline.text_encoder, scheduler=scheduler, clip_model=clip_model, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) return guided_pipeline def infer( - prompt, - clip_prompt, - num_return_images=1, - num_images_per_prompt=1, - num_inference_steps=50, - clip_guidance_scale=100, - guidance_scale=7.5, - guided_pipeline=None, - negative_prompt="", - use_cutouts=True, - num_cutouts=4, - seed=None, - unfreeze_unet=True, - unfreeze_vae=True, ): + prompt, + clip_prompt, + num_return_images=1, + num_images_per_prompt=1, + num_inference_steps=50, + clip_guidance_scale=100, + guidance_scale=7.5, + guided_pipeline=None, + negative_prompt="", + use_cutouts=True, + num_cutouts=4, + seed=None, + unfreeze_unet=True, + unfreeze_vae=True, +): clip_prompt = clip_prompt if 
clip_prompt.strip() != "" else None if unfreeze_unet: guided_pipeline.unfreeze_unet() @@ -98,7 +99,8 @@ def infer( num_cutouts=num_cutouts, use_cutouts=use_cutouts, seed=seed, - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images images.extend(image) return image_grid(images, 1, len(images)) @@ -141,6 +143,7 @@ def infer( num_cutouts=num_cutouts, seed=seed, unfreeze_unet=unfreeze_unet, - unfreeze_vae=unfreeze_vae, ) + unfreeze_vae=unfreeze_vae, + ) display(grid_image) diff --git a/ppdiffusers/examples/community/interpolate_stable_diffusion.py b/ppdiffusers/examples/community/interpolate_stable_diffusion.py index 82ed3fbc72ad5..d826aad5ac9fb 100644 --- a/ppdiffusers/examples/community/interpolate_stable_diffusion.py +++ b/ppdiffusers/examples/community/interpolate_stable_diffusion.py @@ -20,18 +20,16 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -90,31 +88,27 @@ class StableDiffusionWalkPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -136,10 +130,10 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. When this option is enabled, the attention module will split the input tensor in slices, to compute attention @@ -166,23 +160,24 @@ def disable_attention_slicing(self): @paddle.no_grad() def __call__( - self, - prompt: Optional[Union[str, List[str]]]=None, - height: int=512, - width: int=512, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - text_embeddings: Optional[paddle.Tensor]=None, - **kwargs, ): + self, + prompt: Optional[Union[str, List[str]]] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + text_embeddings: Optional[paddle.Tensor] = None, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -240,16 +235,15 @@ def __call__( """ if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if text_embeddings is None: if isinstance(prompt, str): @@ -257,37 +251,33 @@ def __call__( elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") # get prompt text embeddings text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) print( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, : - self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] else: batch_size = text_embeddings.shape[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -301,14 +291,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -318,23 +310,20 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -349,13 +338,10 @@ def __call__( ] latents_dtype = text_embeddings.dtype if latents is None: - latents = paddle.randn( - latents_shape, generator=generator, dtype=latents_dtype) + latents = paddle.randn(latents_shape, generator=generator, dtype=latents_dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") latents = latents # set timesteps @@ -372,33 +358,26 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -413,12 +392,11 @@ def __call__( image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.astype( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -428,8 +406,7 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def embed_text(self, text): """takes in text and turns it into text embeddings""" @@ -438,7 +415,8 @@ def embed_text(self, text): padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) with paddle.no_grad(): embed = self.text_encoder(text_input.input_ids)[0] return embed @@ -448,21 +426,23 @@ def get_noise(self, seed, dtype=paddle.float32, height=512, width=512): return paddle.randn( (1, self.unet.in_channels, height // 8, width // 8), generator=paddle.Generator().manual_seed(seed), - dtype=dtype, ) + dtype=dtype, + ) def walk( - self, - prompts: List[str], - seeds: List[int], - num_interpolation_steps: Optional[int]=6, - output_dir: Optional[str]="./dreams", - name: Optional[str]=None, - batch_size: Optional[int]=1, - height: Optional[int]=512, - width: Optional[int]=512, - guidance_scale: Optional[float]=7.5, - num_inference_steps: Optional[int]=50, - eta: Optional[float]=0.0, ) -> List[str]: + self, + prompts: List[str], + seeds: List[int], 
+ num_interpolation_steps: Optional[int] = 6, + output_dir: Optional[str] = "./dreams", + name: Optional[str] = None, + batch_size: Optional[int] = 1, + height: Optional[int] = 512, + width: Optional[int] = 512, + guidance_scale: Optional[float] = 7.5, + num_inference_steps: Optional[int] = 50, + eta: Optional[float] = 0.0, + ) -> List[str]: """ Walks through a series of prompts and seeds, interpolating between them and saving the results to disk. Args: @@ -509,8 +489,7 @@ def walk( frame_idx = 0 frame_filepaths = [] - for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], - seeds, seeds[1:]): + for prompt_a, prompt_b, seed_a, seed_b in zip(prompts, prompts[1:], seeds, seeds[1:]): # Embed Text embed_a = self.embed_text(prompt_a) embed_b = self.embed_text(prompt_b) @@ -526,14 +505,10 @@ def walk( noise = slerp(float(t), noise_a, noise_b) embed = paddle.lerp(embed_a, embed_b, t) - noise_batch = (noise if noise_batch is None else paddle.concat( - [noise_batch, noise], axis=0)) - embeds_batch = (embed - if embeds_batch is None else paddle.concat( - [embeds_batch, embed], axis=0)) + noise_batch = noise if noise_batch is None else paddle.concat([noise_batch, noise], axis=0) + embeds_batch = embed if embeds_batch is None else paddle.concat([embeds_batch, embed], axis=0) - batch_is_ready = (embeds_batch.shape[0] == batch_size or - i + 1 == T.shape[0]) + batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0] if batch_is_ready: outputs = self( latents=noise_batch, @@ -542,12 +517,12 @@ def walk( width=width, guidance_scale=guidance_scale, eta=eta, - num_inference_steps=num_inference_steps, ) + num_inference_steps=num_inference_steps, + ) noise_batch, embeds_batch = None, None for image in outputs["images"]: - frame_filepath = str(save_path / - f"frame_{frame_idx:06d}.png") + frame_filepath = str(save_path / f"frame_{frame_idx:06d}.png") image.save(frame_filepath) frame_filepaths.append(frame_filepath) frame_idx += 1 diff --git a/ppdiffusers/examples/community/lpw_stable_diffusion.py b/ppdiffusers/examples/community/lpw_stable_diffusion.py index 6870f3e68508a..c52d942b0b5a4 100644 --- a/ppdiffusers/examples/community/lpw_stable_diffusion.py +++ b/ppdiffusers/examples/community/lpw_stable_diffusion.py @@ -19,17 +19,18 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from paddlemix.utils.tools import compare_version from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, StableDiffusionPipelineOutput) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) + StableDiffusionPipeline, + StableDiffusionPipelineOutput, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import logging if compare_version(PIL.__version__, "9.1.0") >= 0: @@ -55,7 +56,8 @@ [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -144,9 +146,7 @@ def multiply_range(start_position, multiplier): return res -def get_prompts_with_weights(pipe: StableDiffusionPipeline, - prompt: List[str], - max_length: int): +def 
get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int): r""" Tokenize a list of prompts and return its tokens with weights of each token. No padding, starting or ending token is included. @@ -176,32 +176,20 @@ def get_prompts_with_weights(pipe: StableDiffusionPipeline, tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. """ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -209,8 +197,7 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] @@ -219,10 +206,11 @@ def pad_tokens_and_weights(tokens, def get_unweighted_text_embeddings( - pipe: StableDiffusionPipeline, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, ): + pipe: StableDiffusionPipeline, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. 
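The chunked encoding that the following hunk reformats can be illustrated with a small, self-contained sketch. This is an illustration only, not the pipeline's code: `encode_chunk` is a hypothetical stand-in for a call to the CLIP text encoder, and the real implementation works on padded paddle tensors and also honours the `no_boseos_middle` option.

```python
# Sketch: encode a prompt longer than the encoder window by slicing it into
# (chunk_length - 2)-token pieces and re-attaching BOS/EOS to every piece.
from typing import Callable, List


def encode_long_prompt(
    token_ids: List[int],  # prompt tokens without BOS/EOS
    bos: int,
    eos: int,
    chunk_length: int,  # 77 for CLIP
    encode_chunk: Callable[[List[int]], List[List[float]]],  # hypothetical encoder call
) -> List[List[float]]:
    body = chunk_length - 2
    hidden_states: List[List[float]] = []
    for start in range(0, len(token_ids), body):
        piece = [bos] + token_ids[start : start + body] + [eos]
        hidden_states.extend(encode_chunk(piece))  # one encoder pass per chunk
    return hidden_states


# toy encoder: a 1-dim "embedding" per token, just to show the bookkeeping
toy = lambda ids: [[float(t)] for t in ids]
assert len(encode_long_prompt(list(range(150)), 49406, 49407, 77, toy)) == 154  # 2 chunks x 77 positions
```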
@@ -232,8 +220,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -259,14 +246,15 @@ def get_unweighted_text_embeddings( def get_weighted_text_embeddings( - pipe: StableDiffusionPipeline, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + pipe: StableDiffusionPipeline, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -290,24 +278,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. """ - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -315,33 +298,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - 
) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( prompt_tokens, @@ -351,7 +327,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( @@ -362,7 +339,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) # get the embeddings @@ -370,32 +348,28 @@ def get_weighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - prompt_weights = paddle.to_tensor( - prompt_weights, dtype=text_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - uncond_weights = paddle.to_tensor( - uncond_weights, dtype=uncond_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? 
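The weighting step in the next hunk (scale each token embedding by its parsed weight, then restore the embedding's original mean) can be sketched in a few lines of numpy. This only illustrates the arithmetic with assumed shapes; it is not the pipeline's paddle code.

```python
# Sketch of the prompt-weighting arithmetic: weight tokens, then renormalize so the
# overall magnitude of the embedding is unchanged.
import numpy as np


def apply_prompt_weights(embeddings: np.ndarray, weights: np.ndarray) -> np.ndarray:
    """embeddings: (batch, seq_len, dim); weights: (batch, seq_len)."""
    previous_mean = embeddings.mean(axis=(-2, -1), keepdims=True)
    weighted = embeddings * weights[..., None]
    # rescale so that emphasising a few tokens does not change the overall mean
    weighted *= previous_mean / weighted.mean(axis=(-2, -1), keepdims=True)
    return weighted


emb = np.random.rand(1, 77, 768).astype("float32")
w = np.ones((1, 77), dtype="float32")
w[:, 5:8] = 1.1  # tokens wrapped in "(...)" would get a weight > 1
assert np.isclose(apply_prompt_weights(emb, w).mean(), emb.mean(), rtol=1e-3)
```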
if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= ( - (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1) - .unsqueeze(-1)) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= ( - (previous_mean / uncond_embeddings.mean(axis=[-2, -1])) - .unsqueeze(-1).unsqueeze(-1)) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: return text_embeddings, uncond_embeddings @@ -416,8 +390,7 @@ def preprocess_mask(mask, scale_factor=8): mask = mask.convert("L") w, h = mask.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - mask = mask.resize( - (w // scale_factor, h // scale_factor), resample=Resampling.NEAREST) + mask = mask.resize((w // scale_factor, h // scale_factor), resample=Resampling.NEAREST) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? @@ -454,16 +427,16 @@ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: Optional[bool]=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: Optional[bool] = True, + ): super().__init__( vae=vae, text_encoder=text_encoder, @@ -472,7 +445,8 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) self.__init__additional__() def __init__additional__(self): @@ -480,10 +454,10 @@ def __init__additional__(self): setattr( self, "vae_scale_factor", - 2**(len(self.vae.config.block_out_channels) - 1), ) + 2 ** (len(self.vae.config.block_out_channels) - 1), + ) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. 
When this option is enabled, the attention module will split the input tensor in slices, to compute attention @@ -510,34 +484,31 @@ def disable_attention_slicing(self): def check_inputs(self, prompt, height, width, strength, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -564,28 +535,25 @@ def _encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`."
+ ) text_embeddings, uncond_embeddings = get_weighted_text_embeddings( pipe=self, prompt=prompt, - uncond_prompt=negative_prompt - if do_classifier_free_guidance else None, + uncond_prompt=negative_prompt if do_classifier_free_guidance else None, max_embeddings_multiples=max_embeddings_multiples, - **kwargs, ) + **kwargs, + ) bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [1, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) return text_embeddings @@ -602,29 +570,20 @@ def get_timesteps(self, num_inference_steps, strength, is_text2img): timesteps = self.scheduler.timesteps[t_start:] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - height, - width, - dtype, - generator, - latents=None): + def prepare_latents(self, image, timestep, batch_size, height, width, dtype, generator, latents=None): if image is None: shape = ( batch_size, self.unet.in_channels, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if latents is None: latents = paddle.randn(shape, generator=generator, dtype=dtype) else: if latents.shape != shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -644,27 +603,28 @@ def prepare_latents(self, @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[int]=7.5, - strength: Optional[int]=0.8, - num_images_per_prompt: Optional[int]=1, - eta: Optional[int]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[int] = 7.5, + strength: Optional[int] = 0.8, + 
num_images_per_prompt: Optional[int] = 1, + eta: Optional[int] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -753,7 +713,8 @@ def __call__( num_images_per_prompt, do_classifier_free_guidance, negative_prompt, - max_embeddings_multiples, ) + max_embeddings_multiples, + ) dtype = text_embeddings.dtype # 4. Preprocess image and mask @@ -765,17 +726,14 @@ def __call__( mask_image = preprocess_mask(mask_image, self.vae_scale_factor) if mask_image is not None: mask_image = mask_image.astype(dtype=dtype) - mask = paddle.concat([mask_image] * batch_size * - num_images_per_prompt) + mask = paddle.concat([mask_image] * batch_size * num_images_per_prompt) else: mask = None # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps( - num_inference_steps, strength, image is None) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, image is None) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables latents, init_latents_orig, noise = self.prepare_latents( @@ -786,7 +744,8 @@ def __call__( width, dtype, generator, - latents, ) + latents, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -794,46 +753,37 @@ def __call__( # 8. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if mask is not None: # masking - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise, t) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t) latents = (init_latents_proper * mask) + (latents * (1 - mask)) # call the callback, if provided if i % callback_steps == 0: if callback is not None: callback(i, t, latents) - if is_cancelled_callback is not None and is_cancelled_callback( - ): + if is_cancelled_callback is not None and is_cancelled_callback(): return None # 9. Post-processing image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 11. 
Convert to PIL if output_type == "pil": @@ -842,28 +792,28 @@ def __call__( if not return_dict: return image, has_nsfw_concept - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def text2img( - self, - prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]]=None, - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - num_images_per_prompt: Optional[int]=1, - eta: Optional[int]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[int] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function for text-to-image generation. Args: @@ -936,26 +886,28 @@ def text2img( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - negative_prompt: Optional[Union[str, List[str]]]=None, - strength: Optional[float]=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + negative_prompt: Optional[Union[str, List[str]]] = None, + strength: Optional[float] = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function for image-to-image generation. 
Args: @@ -1029,27 +981,29 @@ def img2img( callback=callback, is_cancelled_callback=is_cancelled_callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - negative_prompt: Optional[Union[str, List[str]]]=None, - strength: Optional[float]=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: Optional[bool]=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - is_cancelled_callback: Optional[Callable[[], bool]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + negative_prompt: Optional[Union[str, List[str]]] = None, + strength: Optional[float] = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: Optional[bool] = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + is_cancelled_callback: Optional[Callable[[], bool]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function for inpaint. Args: @@ -1124,4 +1078,5 @@ def inpaint( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) diff --git a/ppdiffusers/examples/community/mixture_tiling.py b/ppdiffusers/examples/community/mixture_tiling.py index 62f8650648596..5ae0911810d10 100644 --- a/ppdiffusers/examples/community/mixture_tiling.py +++ b/ppdiffusers/examples/community/mixture_tiling.py @@ -23,17 +23,18 @@ from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import logging try: from ligo.segments import segment - from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) + from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPTextModel, + CLIPTokenizer, + ) except ImportError: - raise ImportError( - "Please install paddlenlp and ligo-segments to use the mixture pipeline") + raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline") logger = logging.get_logger(__name__) EXAMPLE_DOC_STRING = """ Examples: @@ -61,8 +62,7 @@ """ -def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image Returns a tuple with: @@ -71,11 +71,9 @@ def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - Starting coordinates of columns in pixel 
space - Ending coordinates of columns in pixel space """ - px_row_init = 0 if tile_row == 0 else tile_row * ( - tile_height - tile_row_overlap) + px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) px_row_end = px_row_init + tile_height - px_col_init = 0 if tile_col == 0 else tile_col * ( - tile_width - tile_col_overlap) + px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) px_col_end = px_col_init + tile_width return px_row_init, px_row_end, px_col_init, px_col_end @@ -85,8 +83,7 @@ def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 -def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image Returns a tuple with: @@ -96,15 +93,14 @@ def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - Ending coordinates of columns in latent space """ px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, - px_col_end) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) -def _tile2latent_exclusive_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap, rows, - columns): +def _tile2latent_exclusive_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns +): """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image Returns a tuple with: @@ -114,18 +110,17 @@ def _tile2latent_exclusive_indices(tile_row, tile_col, tile_width, tile_height, - Ending coordinates of columns in latent space """ row_init, row_end, col_init, col_end = _tile2latent_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) row_segment = segment(row_init, row_end) col_segment = segment(col_init, col_end) # Iterate over the rest of tiles, clipping the region for the current tile for row in range(rows): for column in range(columns): if row != tile_row and column != tile_col: - (clip_row_init, clip_row_end, clip_col_init, - clip_col_end) = _tile2latent_indices( - row, column, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) + (clip_row_init, clip_row_end, clip_col_init, clip_col_end) = _tile2latent_indices( + row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) row_segment = row_segment - segment(clip_row_init, clip_row_end) col_segment = col_segment - segment(clip_col_init, clip_col_end) # return row_init, row_end, col_init, col_end @@ -151,17 +146,17 @@ def decode_latents(self, latents, cpu_vae=False): return self.numpy_to_pil(image) -class StableDiffusionTilingPipeline(DiffusionPipeline, - StableDiffusionExtrasMixin): +class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin): def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: 
UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() self.register_modules( vae=vae, @@ -170,7 +165,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) class SeedTilesMode(Enum): """Modes in which the latents of a particular tile can be re-seeded""" @@ -180,22 +176,22 @@ class SeedTilesMode(Enum): @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[List[str]]], - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - tile_height: Optional[int]=512, - tile_width: Optional[int]=512, - tile_row_overlap: Optional[int]=256, - tile_col_overlap: Optional[int]=256, - guidance_scale_tiles: Optional[List[List[float]]]=None, - seed_tiles: Optional[List[List[int]]]=None, - seed_tiles_mode: Optional[Union[str, List[List[str]]]]="full", - seed_reroll_regions: Optional[List[Tuple[int, int, int, int, - int]]]=None, - cpu_vae: Optional[bool]=False, ): + self, + prompt: Union[str, List[List[str]]], + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + tile_height: Optional[int] = 512, + tile_width: Optional[int] = 512, + tile_row_overlap: Optional[int] = 256, + tile_col_overlap: Optional[int] = 256, + guidance_scale_tiles: Optional[List[List[float]]] = None, + seed_tiles: Optional[List[List[int]]] = None, + seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", + seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, + cpu_vae: Optional[bool] = False, + ): """ Function to run the diffusion pipeline with tiling support. @@ -221,24 +217,18 @@ def __call__( A PIL image with the generated image. 
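As a plain-Python illustration of the tile arithmetic used by the `_tile2pixel_indices` and `_pixel2latent_indices` helpers above: a tile's pixel window starts at `tile_index * (tile_size - overlap)` and its latent coordinates are the pixel coordinates divided by 8. The defaults below are the 512-pixel tiles and 256-pixel overlaps used by this pipeline; this is a worked example, not the pipeline's code.

```python
# Worked example of mapping a tile position to its latent-space window.
def tile_to_latent(tile_row, tile_col, tile_width=512, tile_height=512,
                   row_overlap=256, col_overlap=256):
    px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - row_overlap)
    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - col_overlap)
    px_row_end = px_row_init + tile_height
    px_col_end = px_col_init + tile_width
    # latent space is 8x smaller than pixel space for the SD VAE
    return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8


# tile (1, 2) covers latent rows 32..96 and latent columns 64..128
assert tile_to_latent(1, 2) == (32, 96, 64, 128)
```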
""" - if not isinstance(prompt, list) or not all( - isinstance(row, list) for row in prompt): - raise ValueError( - f"`prompt` has to be a list of lists but is {type(prompt)}") + if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): + raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") grid_rows = len(prompt) grid_cols = len(prompt[0]) if not all(len(row) == grid_cols for row in prompt): - raise ValueError( - "All prompt rows must have the same number of prompt columns") + raise ValueError("All prompt rows must have the same number of prompt columns") if not isinstance(seed_tiles_mode, str) and ( - not isinstance(seed_tiles_mode, list) or - not all(isinstance(row, list) for row in seed_tiles_mode)): - raise ValueError( - f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}" - ) + not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) + ): + raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") if isinstance(seed_tiles_mode, str): - seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] - for row in prompt] + seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] modes = [mode.value for mode in self.SeedTilesMode] if any(mode not in modes for row in seed_tiles_mode for mode in row): raise ValueError(f"Seed tiles mode must be one of {modes}") @@ -247,11 +237,9 @@ def __call__( batch_size = 1 # create original noisy latents using the timesteps - height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap - ) + height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) - latents_shape = (batch_size, self.unet.config.in_channels, height // 8, - width // 8) + latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) generator = paddle.Generator().manual_seed(seed) latents = paddle.randn(shape=latents_shape, generator=generator) @@ -263,8 +251,8 @@ def __call__( mode = seed_tiles_mode[row][col] if mode == self.SeedTilesMode.FULL.value: row_init, row_end, col_init, col_end = _tile2latent_indices( - row, col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap) + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) else: row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices( row, @@ -274,29 +262,27 @@ def __call__( tile_row_overlap, tile_col_overlap, grid_rows, - grid_cols, ) - tile_generator = paddle.Generator().manual_seed( - seed_tile) - tile_shape = latents_shape[0], latents_shape[ - 1], row_end - row_init, col_end - col_init - latents[:, :, row_init:row_end, col_init: - col_end] = paddle.randn( - shape=tile_shape, generator=tile_generator) + grid_cols, + ) + tile_generator = paddle.Generator().manual_seed(seed_tile) + tile_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init + latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( + shape=tile_shape, generator=tile_generator + ) # overwrite again for seed reroll regions for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: row_init, row_end, col_init, col_end = _pixel2latent_indices( - row_init, row_end, col_init, - col_end) # to latent space coordinates + row_init, row_end, col_init, col_end + ) # to latent space coordinates reroll_generator = paddle.Generator().manual_seed(seed_reroll) - region_shape = 
latents_shape[0], latents_shape[ - 1], row_end - row_init, col_end - col_init + region_shape = latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=region_shape, generator=reroll_generator) + shape=region_shape, generator=reroll_generator + ) # Prepare scheduler - accepts_offset = "offset" in set( - inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 @@ -306,17 +292,20 @@ def __call__( latents = latents * self.scheduler.sigmas[0] # get prompts text embeddings - text_input = [[ - self.tokenizer( - col, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", ) for col in row - ] for row in prompt] - text_embeddings = [[ - self.text_encoder(col.input_ids)[0] for col in row - ] for row in text_input] + text_input = [ + [ + self.tokenizer( + col, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + for col in row + ] + for row in prompt + ] + text_embeddings = [[self.text_encoder(col.input_ids)[0] for col in row] for row in text_input] # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -328,32 +317,26 @@ def __call__( for j in range(grid_cols): max_length = text_input[i][j].input_ids.shape[-1] uncond_input = self.tokenizer( - [""] * batch_size, - padding="max_length", - max_length=max_length, - return_tensors="pd") - uncond_embeddings = self.text_encoder( - uncond_input.input_ids)[0] + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pd" + ) + uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings[i][j] = paddle.concat( - x=[uncond_embeddings, text_embeddings[i][j]]) + text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]]) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # Mask for tile weights strenght - tile_weights = self._gaussian_weights(tile_width, tile_height, - batch_size) + tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) # Diffusion timesteps for i, t in tqdm(enumerate(self.scheduler.timesteps)): @@ -363,33 +346,28 @@ def __call__( noise_preds_row = [] for col in range(grid_cols): px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - tile_latents = latents[:, :, px_row_init:px_row_end, - px_col_init:px_col_end] + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[tile_latents] * 2) - if do_classifier_free_guidance else - tile_latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings[row][col])[ - "sample"] + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[ + "sample" + ] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - guidance = (guidance_scale - if guidance_scale_tiles is None or - guidance_scale_tiles[row][col] is None else - guidance_scale_tiles[row][col]) - noise_pred_tile = noise_pred_uncond + guidance * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + guidance = ( + guidance_scale + if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None + else guidance_scale_tiles[row][col] + ) + noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) noise_preds_row.append(noise_pred_tile) noise_preds.append(noise_preds_row) # Stitch noise predictions for all tiles @@ -399,13 +377,12 @@ def __call__( for row in range(grid_rows): for col in range(grid_cols): px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - noise_pred[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += (noise_preds[row][col] * - tile_weights) - contributors[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += tile_weights + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += ( + noise_preds[row][col] * tile_weights + ) + contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights # Average overlapping areas with more than 1 contributor noise_pred /= contributors # compute the previous noisy sample x_t -> x_t-1 @@ -424,14 +401,16 @@ def _gaussian_weights(self, tile_width, tile_height, nbatches): latent_height = tile_height // 8 var = 
0.01 midpoint = (latent_width - 1) / 2 - x_probs = [(exp(-(x - midpoint) * (x - midpoint) / - (latent_width * latent_width) / (2 * var)) / - sqrt(2 * pi * var)) for x in range(latent_width)] + x_probs = [ + (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)) + for x in range(latent_width) + ] midpoint = latent_height / 2 - y_probs = [(exp(-(y - midpoint) * (y - midpoint) / - (latent_height * latent_height) / (2 * var)) / - sqrt(2 * pi * var)) for y in range(latent_height)] + y_probs = [ + (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)) + for y in range(latent_height) + ] weights = np.outer(y_probs, x_probs) return paddle.tile( - x=paddle.to_tensor(data=weights), - repeat_times=(nbatches, self.unet.config.in_channels, 1, 1)) + x=paddle.to_tensor(data=weights), repeat_times=(nbatches, self.unet.config.in_channels, 1, 1) + ) diff --git a/ppdiffusers/examples/community/one_step_unet.py b/ppdiffusers/examples/community/one_step_unet.py index 489cef26e01d8..5baffefdab061 100644 --- a/ppdiffusers/examples/community/one_step_unet.py +++ b/ppdiffusers/examples/community/one_step_unet.py @@ -24,15 +24,14 @@ def __init__(self, unet, scheduler): self.register_modules(unet=unet, scheduler=scheduler) def __call__(self): - image = paddle.randn((1, self.unet.in_channels, self.unet.sample_size, - self.unet.sample_size), ) + image = paddle.randn( + (1, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), + ) timestep = 1 model_output = self.unet(image, timestep).sample - scheduler_output = self.scheduler.step(model_output, timestep, - image).prev_sample + scheduler_output = self.scheduler.step(model_output, timestep, image).prev_sample - result = (scheduler_output - scheduler_output + - paddle.ones_like(scheduler_output)) + result = scheduler_output - scheduler_output + paddle.ones_like(scheduler_output) return result diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py index 218ef8d7ab49c..b32b422bd47ae 100644 --- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py +++ b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_hires_fix.py @@ -23,17 +23,17 @@ from ppdiffusers import DiffusionPipeline from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel) -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, +) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from ppdiffusers.schedulers import KarrasDiffusionSchedulers from ppdiffusers.utils import logging, randn_tensor logger = logging.get_logger(__name__) -class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, - FastDeployDiffusionPipelineMixin): +class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-to-image generation with high resolution fixing(hires.fix) based on Stable Diffusion. @@ -63,21 +63,20 @@ class FastStableDiffusionHiresFixPipeline(DiffusionPipeline, feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" - _optional_components = [ - "vae_encoder", "safety_checker", "feature_extractor" - ] + _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -102,7 +101,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() @@ -111,7 +111,7 @@ def get_timesteps(self, denoising_steps, denoising_strength): self.scheduler.set_timesteps(steps) t_start = max(steps - denoising_steps, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] if hasattr(self.scheduler, "step_index_offset"): self.scheduler.step_index_offset = t_start * self.scheduler.order @@ -119,48 +119,45 @@ def get_timesteps(self, denoising_steps, denoising_strength): return timesteps.cast("float32"), denoising_steps def check_inputs( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + hr_scale, + hr_resize_height, + hr_resize_width, + denoising_strength, + latent_scale_mode, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if hr_scale < 0: - raise ValueError( - "hr_scale shoule be greater that 0, but acceived {hr_scale}") + raise ValueError("hr_scale shoule be greater that 0, but acceived {hr_scale}") if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: raise ValueError( @@ -168,9 +165,7 @@ def check_inputs( ) if denoising_strength > 1 or denoising_strength < 0: - raise ValueError( - f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}" - ) + raise ValueError(f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -188,14 +183,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") - - def get_upscaled_width_and_height(self, - width, - height, - hr_scale=2, - hr_resize_width=0, - hr_resize_height=0): + f" {negative_prompt_embeds.shape}." + ) + + def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): if hr_resize_width == 0 and hr_resize_height == 0: hr_upscale_to_width = int(width * hr_scale) hr_upscale_to_height = int(height * hr_scale) @@ -221,36 +212,36 @@ def get_upscaled_width_and_height(self, @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=40, - hires_ratio: Optional[float]=0.5, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - enable_hr: Optional[bool]=True, - hr_scale: Optional[float]=2.0, - hr_resize_width: Optional[int]=0, - hr_resize_height: Optional[int]=0, - denoising_strength: Optional[float]=0.7, - latent_scale_mode: Optional[str]="nearest", - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 40, + hires_ratio: Optional[float] = 0.5, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: 
Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + enable_hr: Optional[bool] = True, + hr_scale: Optional[float] = 2.0, + hr_resize_width: Optional[int] = 0, + hr_resize_height: Optional[int] = 0, + denoising_strength: Optional[float] = 0.7, + latent_scale_mode: Optional[str] = "nearest", + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -347,7 +338,8 @@ def __call__( latent_scale_mode, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -373,7 +365,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -385,7 +378,8 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Prepare timesteps if enable_hr: @@ -401,18 +395,17 @@ def __call__( # 5. Prepare latent variables if generator is None: generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - generator_state) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - paddle.Generator().states_[generator]) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) latents = self.prepare_latents( batch_size * num_images_per_prompt, height, width, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
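# `prepare_extra_step_kwargs` only forwards `eta` and `generator` when the
# scheduler's `step()` signature actually accepts them (eta is only meaningful
# for DDIM-style schedulers). A minimal, self-contained sketch of that check —
# `build_extra_step_kwargs` is a hypothetical helper, not part of this pipeline:
import inspect


def build_extra_step_kwargs(scheduler, generator, eta):
    step_params = set(inspect.signature(scheduler.step).parameters.keys())
    extra_step_kwargs = {}
    if "eta" in step_params:  # eta corresponds to η in the DDIM paper
        extra_step_kwargs["eta"] = eta
    if "generator" in step_params:
        extra_step_kwargs["generator"] = generator
    return extra_step_kwargs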
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -423,34 +416,29 @@ def __call__( with self.progress_bar(total=sample_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -462,15 +450,13 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -483,19 +469,16 @@ def __call__( # 8. determine the upscaled width and height for upscaled images truncate_width = 0 truncate_height = 0 - ( - hr_upscale_to_width, - hr_upscale_to_height, ) = self.get_upscaled_width_and_height( - width, - height, - hr_scale=hr_scale, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, ) + (hr_upscale_to_width, hr_upscale_to_height,) = self.get_upscaled_width_and_height( + width, + height, + hr_scale=hr_scale, + hr_resize_width=hr_resize_width, + hr_resize_height=hr_resize_height, + ) if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (hr_upscale_to_width - hr_resize_width - ) // self.vae_scale_factor - truncate_height = (hr_upscale_to_height - hr_resize_height - ) // self.vae_scale_factor + truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor + truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor # 9. 
special case: do nothing if upscaling is not nesscessary if hr_upscale_to_width == width and hr_upscale_to_height == height: @@ -504,77 +487,69 @@ def __call__( if enable_hr: if do_controlnet: - ( - control_image, - control_conditioning_scale, - ) = self.prepare_controlnet_cond( + (control_image, control_conditioning_scale,) = self.prepare_controlnet_cond( controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, width=hr_upscale_to_width, height=hr_upscale_to_height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 10. prepare init latents - timesteps, hr_steps = self.get_timesteps(hr_steps, - denoising_strength) + timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength) init_timestep = timesteps[:1].tile([latents.shape[0]]) latents = F.interpolate( latents, size=( hr_upscale_to_height // self.vae_scale_factor, - hr_upscale_to_width // self.vae_scale_factor, ), - mode=latent_scale_mode, ) - latents = latents[:, :, truncate_height // 2:latents.shape[2] - ( - truncate_height + 1) // 2, truncate_width // 2:latents.shape[3] - - (truncate_width + 1) // 2, ] - - noise = randn_tensor( - latents.shape, - dtype=latents.dtype, - generator="initial_generator") + hr_upscale_to_width // self.vae_scale_factor, + ), + mode=latent_scale_mode, + ) + latents = latents[ + :, + :, + truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, + truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, + ] + + noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") latents = self.scheduler.add_noise(latents, noise, init_timestep) # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs( - "initial_generator", eta) + extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) # 12. 
denoising on hires.fix steps num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order with self.progress_bar(total=hr_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else - latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -586,16 +561,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -606,7 +579,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -617,11 +591,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py index 
bf9bbf48e6e90..2fb5aa69a20ee 100644 --- a/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py +++ b/ppdiffusers/examples/community/pipeline_fastdeploy_stable_diffusion_mixture_tiling.py @@ -24,10 +24,12 @@ # from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel) + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, +) + # from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import logging try: @@ -35,13 +37,11 @@ from paddlenlp.transformers import CLIPFeatureExtractor # CLIPTextModel, from paddlenlp.transformers import CLIPTokenizer except ImportError: - raise ImportError( - "Please install paddlenlp and ligo-segments to use the mixture pipeline") + raise ImportError("Please install paddlenlp and ligo-segments to use the mixture pipeline") logger = logging.get_logger(__name__) -def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image Returns a tuple with: @@ -50,11 +50,9 @@ def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, - Starting coordinates of columns in pixel space - Ending coordinates of columns in pixel space """ - px_row_init = 0 if tile_row == 0 else tile_row * ( - tile_height - tile_row_overlap) + px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) px_row_end = px_row_init + tile_height - px_col_init = 0 if tile_col == 0 else tile_col * ( - tile_width - tile_col_overlap) + px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) px_col_end = px_col_init + tile_width return px_row_init, px_row_end, px_col_init, px_col_end @@ -64,8 +62,7 @@ def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 -def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - tile_row_overlap, tile_col_overlap): +def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image Returns a tuple with: @@ -75,21 +72,21 @@ def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, - Ending coordinates of columns in latent space """ px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) - return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, - px_col_end) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) def _tile2latent_exclusive_indices( - tile_row, - tile_col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, - rows, - columns, ): + tile_row, + tile_col, + tile_width, + tile_height, + tile_row_overlap, + 
tile_col_overlap, + rows, + columns, +): """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image Returns a tuple with: @@ -99,25 +96,22 @@ def _tile2latent_exclusive_indices( - Ending coordinates of columns in latent space """ row_init, row_end, col_init, col_end = _tile2latent_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, - tile_col_overlap) + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) row_segment = segment(row_init, row_end) col_segment = segment(col_init, col_end) # Iterate over the rest of tiles, clipping the region for the current tile for row in range(rows): for column in range(columns): if row != tile_row and column != tile_col: - ( - clip_row_init, - clip_row_end, - clip_col_init, - clip_col_end, ) = _tile2latent_indices( - row, - column, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, ) + (clip_row_init, clip_row_end, clip_col_init, clip_col_end,) = _tile2latent_indices( + row, + column, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + ) row_segment = row_segment - segment(clip_row_init, clip_row_end) col_segment = col_segment - segment(clip_col_init, clip_col_end) # return row_init, row_end, col_init, col_end @@ -127,10 +121,7 @@ def _tile2latent_exclusive_indices( class StableDiffusionExtrasMixin: """Mixin providing additional convenience method to Stable Diffusion pipelines""" - def _decode_vae_latents(self, - latents: paddle.Tensor, - infer_op=None, - **kwargs): + def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): latents_shape = latents.shape output_shape = [ latents_shape[0], @@ -143,7 +134,8 @@ def _decode_vae_latents(self, images_vae = self.vae_decoder( latent_sample=latents, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return images_vae @@ -163,19 +155,20 @@ def decode_latents(self, latents, cpu_vae=False): return self.numpy_to_pil(image) -class FastDeployStableDiffusionTilingPipeline(DiffusionPipeline, - StableDiffusionExtrasMixin, - FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionTilingPipeline( + DiffusionPipeline, StableDiffusionExtrasMixin, FastDeployDiffusionPipelineMixin +): def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: Union[DDIMScheduler, PNDMScheduler], - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler], + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() self.register_modules( vae_encoder=vae_encoder, @@ -185,7 +178,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.post_init() class SeedTilesMode(Enum): @@ -196,24 +190,24 @@ class SeedTilesMode(Enum): @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[List[str]]], - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - tile_height: Optional[int]=512, - 
tile_width: Optional[int]=512, - tile_row_overlap: Optional[int]=256, - tile_col_overlap: Optional[int]=256, - guidance_scale_tiles: Optional[List[List[float]]]=None, - seed_tiles: Optional[List[List[int]]]=None, - seed_tiles_mode: Optional[Union[str, List[List[str]]]]="full", - seed_reroll_regions: Optional[List[Tuple[int, int, int, int, - int]]]=None, - # parse_prompt_type: Optional[str] = "lpw", - # max_embeddings_multiples: Optional[int] = 3, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[List[str]]], + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + tile_height: Optional[int] = 512, + tile_width: Optional[int] = 512, + tile_row_overlap: Optional[int] = 256, + tile_col_overlap: Optional[int] = 256, + guidance_scale_tiles: Optional[List[List[float]]] = None, + seed_tiles: Optional[List[List[int]]] = None, + seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", + seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, + # parse_prompt_type: Optional[str] = "lpw", + # max_embeddings_multiples: Optional[int] = 3, + infer_op_dict: Dict[str, str] = None, + ): """ Function to run the diffusion pipeline with tiling support. @@ -244,24 +238,18 @@ def __call__( """ infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) - if not isinstance(prompt, list) or not all( - isinstance(row, list) for row in prompt): - raise ValueError( - f"`prompt` has to be a list of lists but is {type(prompt)}") + if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): + raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") grid_rows = len(prompt) grid_cols = len(prompt[0]) if not all(len(row) == grid_cols for row in prompt): - raise ValueError( - "All prompt rows must have the same number of prompt columns") + raise ValueError("All prompt rows must have the same number of prompt columns") if not isinstance(seed_tiles_mode, str) and ( - not isinstance(seed_tiles_mode, list) or - not all(isinstance(row, list) for row in seed_tiles_mode)): - raise ValueError( - f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}" - ) + not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) + ): + raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") if isinstance(seed_tiles_mode, str): - seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] - for row in prompt] + seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] modes = [mode.value for mode in self.SeedTilesMode] if any(mode not in modes for row in seed_tiles_mode for mode in row): raise ValueError(f"Seed tiles mode must be one of {modes}") @@ -270,14 +258,14 @@ def __call__( batch_size = 1 # create original noisy latents using the timesteps - height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap - ) + height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) latents_shape = ( batch_size, self.vae_decoder_num_latent_channels, height // 8, - width // 8, ) + width // 8, + ) generator = paddle.Generator().manual_seed(seed) latents = paddle.randn(shape=latents_shape, generator=generator) @@ -295,49 +283,48 @@ def __call__( tile_width, tile_height, tile_row_overlap, - tile_col_overlap, ) + tile_col_overlap, + ) else: - ( - row_init, 
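# Worked example of the tile -> latent index mapping used here: with 512x512
# tiles and 256px overlaps, adjacent tiles share half of their area, and pixel
# coordinates are divided by 8 to land in latent space. `tile_to_latent` is a
# hypothetical standalone helper mirroring _tile2pixel_indices/_pixel2latent_indices:
def tile_to_latent(tile_row, tile_col, tile_width=512, tile_height=512, row_overlap=256, col_overlap=256):
    px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - row_overlap)
    px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - col_overlap)
    px_row_end, px_col_end = px_row_init + tile_height, px_col_init + tile_width
    return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8


# tile (0, 1) covers pixels [0:512, 256:768], i.e. latents [0:64, 32:96]
assert tile_to_latent(0, 1) == (0, 64, 32, 96)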
- row_end, - col_init, - col_end, ) = _tile2latent_exclusive_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, - grid_rows, - grid_cols, ) - tile_generator = paddle.Generator().manual_seed( - seed_tile) + (row_init, row_end, col_init, col_end,) = _tile2latent_exclusive_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + grid_rows, + grid_cols, + ) + tile_generator = paddle.Generator().manual_seed(seed_tile) tile_shape = ( latents_shape[0], latents_shape[1], row_end - row_init, - col_end - col_init, ) - latents[:, :, row_init:row_end, col_init: - col_end] = paddle.randn( - shape=tile_shape, generator=tile_generator) + col_end - col_init, + ) + latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( + shape=tile_shape, generator=tile_generator + ) # overwrite again for seed reroll regions for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: row_init, row_end, col_init, col_end = _pixel2latent_indices( - row_init, row_end, col_init, - col_end) # to latent space coordinates + row_init, row_end, col_init, col_end + ) # to latent space coordinates reroll_generator = paddle.Generator().manual_seed(seed_reroll) region_shape = ( latents_shape[0], latents_shape[1], row_end - row_init, - col_end - col_init, ) + col_end - col_init, + ) latents[:, :, row_init:row_end, col_init:col_end] = paddle.randn( - shape=region_shape, generator=reroll_generator) + shape=region_shape, generator=reroll_generator + ) # Prepare scheduler - accepts_offset = "offset" in set( - inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: extra_set_kwargs["offset"] = 1 @@ -347,18 +334,22 @@ def __call__( latents = latents * self.scheduler.sigmas[0] # get prompts text embeddings - text_input = [[ - self.tokenizer( - col, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pd", ) for col in row - ] for row in prompt] - text_embeddings = [[ - self.text_encoder(input_ids=col.input_ids.astype(np.int64))[0] - for col in row - ] for row in text_input] + text_input = [ + [ + self.tokenizer( + col, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + for col in row + ] + for row in prompt + ] + text_embeddings = [ + [self.text_encoder(input_ids=col.input_ids.astype(np.int64))[0] for col in row] for row in text_input + ] # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -373,29 +364,26 @@ def __call__( [""] * batch_size, padding="max_length", max_length=max_length, - return_tensors="pd", ) - uncond_embeddings = self.text_encoder( - input_ids=uncond_input.input_ids.astype(np.int64))[0] + return_tensors="pd", + ) + uncond_embeddings = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int64))[0] # For classifier free guidance, we need to do two forward passes. 
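# A compact sketch of the classifier-free guidance pattern used in this file:
# the unconditional and conditional inputs are stacked into one batch, the model
# is run once, and the two halves of its output are recombined. `fake_unet` is a
# stand-in for the real UNet and the numbers are purely illustrative:
import numpy as np


def cfg_combine(noise_uncond, noise_text, guidance_scale):
    # guidance_scale == 1.0 disables guidance; larger values push towards the prompt
    return noise_uncond + guidance_scale * (noise_text - noise_uncond)


fake_unet = lambda latent_batch: latent_batch * 0.1  # toy model
latents = np.random.randn(1, 4, 8, 8).astype("float32")
latent_model_input = np.concatenate([latents, latents])  # [uncond, cond] in a single batch
noise_uncond, noise_text = np.split(fake_unet(latent_model_input), 2)
noise_pred = cfg_combine(noise_uncond, noise_text, guidance_scale=7.5)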
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings[i][j] = paddle.concat( - x=[uncond_embeddings, text_embeddings[i][j]]) + text_embeddings[i][j] = paddle.concat(x=[uncond_embeddings, text_embeddings[i][j]]) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # Mask for tile weights strenght - tile_weights = self._gaussian_weights(tile_width, tile_height, - batch_size) + tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) # Diffusion timesteps is_scheduler_support_step_index = self.is_scheduler_support_step_index() @@ -406,48 +394,42 @@ def __call__( for row in range(grid_rows): noise_preds_row = [] for col in range(grid_cols): - ( - px_row_init, - px_row_end, - px_col_init, - px_col_end, ) = _tile2latent_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, ) - tile_latents = latents[:, :, px_row_init:px_row_end, - px_col_init:px_col_end] + (px_row_init, px_row_end, px_col_init, px_col_end,) = _tile2latent_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + ) + tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[tile_latents] * 2) - if do_classifier_free_guidance else - tile_latents) + latent_model_input = ( + paddle.concat(x=[tile_latents] * 2) if do_classifier_free_guidance else tile_latents + ) if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=text_embeddings[row][col], infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) noise_pred = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - guidance = (guidance_scale - if guidance_scale_tiles is None or - guidance_scale_tiles[row][col] is None else - guidance_scale_tiles[row][col]) - noise_pred_tile = noise_pred_uncond + guidance * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + guidance = ( + guidance_scale + if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None + else guidance_scale_tiles[row][col] + ) + noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) noise_preds_row.append(noise_pred_tile) noise_preds.append(noise_preds_row) # Stitch noise predictions for all tiles @@ -456,22 +438,18 @@ def __call__( # Add each tile 
contribution to overall latents for row in range(grid_rows): for col in range(grid_cols): - ( - px_row_init, - px_row_end, - px_col_init, - px_col_end, ) = _tile2latent_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, ) - noise_pred[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += (noise_preds[row][col] * - tile_weights) - contributors[:, :, px_row_init:px_row_end, px_col_init: - px_col_end] += tile_weights + (px_row_init, px_row_end, px_col_init, px_col_end,) = _tile2latent_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + ) + noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += ( + noise_preds[row][col] * tile_weights + ) + contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights # Average overlapping areas with more than 1 contributor noise_pred /= contributors # compute the previous noisy sample x_t -> x_t-1 @@ -481,10 +459,10 @@ def __call__( t, latents, step_index=i, - return_pred_original_sample=False, ).prev_sample + return_pred_original_sample=False, + ).prev_sample else: - latents = self.scheduler.step(noise_pred, t, - latents).prev_sample + latents = self.scheduler.step(noise_pred, t, latents).prev_sample if i == len(self.scheduler.timesteps) - 1: # sync for accuracy it/s measure paddle.device.cuda.synchronize() @@ -505,13 +483,15 @@ def _gaussian_weights(self, tile_width, tile_height, nbatches): latent_height = tile_height // 8 var = 0.01 midpoint = (latent_width - 1) / 2 - x_probs = [(exp(-(x - midpoint) * (x - midpoint) / - (latent_width * latent_width) / (2 * var)) / - sqrt(2 * pi * var)) for x in range(latent_width)] + x_probs = [ + (exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)) + for x in range(latent_width) + ] midpoint = latent_height / 2 - y_probs = [(exp(-(y - midpoint) * (y - midpoint) / - (latent_height * latent_height) / (2 * var)) / - sqrt(2 * pi * var)) for y in range(latent_height)] + y_probs = [ + (exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)) + for y in range(latent_height) + ] weights = np.outer(y_probs, x_probs) return paddle.tile( x=paddle.to_tensor(data=weights), diff --git a/ppdiffusers/examples/community/reference_only.py b/ppdiffusers/examples/community/reference_only.py index 7f3035e62a6ea..816ee95647862 100644 --- a/ppdiffusers/examples/community/reference_only.py +++ b/ppdiffusers/examples/community/reference_only.py @@ -20,24 +20,32 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from PIL import Image from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.models.cross_attention import CrossAttention from ppdiffusers.models.transformer_2d import Transformer2DModelOutput -from ppdiffusers.models.unet_2d_blocks import (ResnetBlock2D, - Transformer2DModel, Upsample2D) +from ppdiffusers.models.unet_2d_blocks import ( + ResnetBlock2D, + Transformer2DModel, + Upsample2D, +) from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from 
ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (PIL_INTERPOLATION, check_min_version, deprecate, - logging, randn_tensor, replace_example_docstring) +from ppdiffusers.utils import ( + PIL_INTERPOLATION, + check_min_version, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) check_min_version("0.14.1") @@ -70,18 +78,14 @@ def stable_var(x, axis=None, unbiased=True, keepdim=False, name=None): dtype = x.dtype u = paddle.mean(x, axis=axis, keepdim=True, name=name) - n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast( - paddle.numel(u), paddle.int64) + n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast(paddle.numel(u), paddle.int64) n = n.astype(dtype) if unbiased: one_const = paddle.ones([], x.dtype) n = paddle.where(n > one_const, n - 1.0, one_const) n = n**0.5 n.stop_gradient = True - out = paddle.sum(paddle.pow((x - u) / n, 2), - axis=axis, - keepdim=keepdim, - name=name) + out = paddle.sum(paddle.pow((x - u) / n, 2), axis=axis, keepdim=keepdim, name=name) return out @@ -94,11 +98,12 @@ def var_mean(x, axis=-1, keepdim=True, unbiased=True, correction=None): def self_attn_forward( - self, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, +): attn_output = None if getattr(self, "enable_attn", False): @@ -114,31 +119,34 @@ def self_attn_forward( hidden_states=image_hidden_states, encoder_hidden_states=image_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) latent_self_attn1_uc = self.processor( self, latent_hidden_states, encoder_hidden_states=paddle.concat( - [latent_hidden_states] + image_hidden_states.split( - [chunk_num] * - (image_hidden_states.shape[0] // chunk_num)), - axis=1, ), + [latent_hidden_states] + + image_hidden_states.split([chunk_num] * (image_hidden_states.shape[0] // chunk_num)), + axis=1, + ), attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: latent_self_attn1_c = latent_self_attn1_uc.clone() latent_self_attn1_c[self.current_uc_indices] = self.processor( self, hidden_states=latent_hidden_states[self.current_uc_indices], - encoder_hidden_states=latent_hidden_states[ - self.current_uc_indices], + encoder_hidden_states=latent_hidden_states[self.current_uc_indices], attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) latent_self_attn1 = ( - self.current_style_fidelity * latent_self_attn1_c + - (1.0 - self.current_style_fidelity) * latent_self_attn1_uc) + self.current_style_fidelity * latent_self_attn1_c + + (1.0 - self.current_style_fidelity) * latent_self_attn1_uc + ) else: latent_self_attn1 = latent_self_attn1_uc @@ -150,25 +158,28 @@ def self_attn_forward( hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) return attn_output def transformer_2d_model_forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + 
encoder_hidden_states=None, + timestep=None, + class_labels=None, + cross_attention_kwargs=None, + return_dict: bool = True, +): x = self.original_forward( hidden_states, encoder_hidden_states=encoder_hidden_states, timestep=timestep, class_labels=class_labels, cross_attention_kwargs=cross_attention_kwargs, - return_dict=return_dict, )[0] + return_dict=return_dict, + )[0] output = None if getattr(self, "enable_gn", False): if self.gn_auto_machine_weight > self.gn_weight: @@ -177,26 +188,20 @@ def transformer_2d_model_forward( latent_hidden_states = x[:chunk_num] # uc, c image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean( - image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean( - latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5 + image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 div_num = image_hidden_states.shape[0] // chunk_num mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - std_acc = paddle.maximum(var_acc, - paddle.zeros_like(var_acc) + EPS)**0.5 + std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[ - self.current_uc_indices] - latent_hidden_states = ( - self.current_style_fidelity * y_c + - (1.0 - self.current_style_fidelity) * y_uc) + y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] + latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc else: latent_hidden_states = y_uc output = paddle.concat([latent_hidden_states, image_hidden_states]) @@ -204,7 +209,7 @@ def transformer_2d_model_forward( if output is None: output = x if not return_dict: - return (output, ) + return (output,) return Transformer2DModelOutput(sample=output) @@ -219,26 +224,20 @@ def resnet_block_2d_forward(self, input_tensor, temb): latent_hidden_states = x[:chunk_num] # uc, c image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean( - image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean( - latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5 + image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 div_num = image_hidden_states.shape[0] // chunk_num mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - std_acc = paddle.maximum(var_acc, - paddle.zeros_like(var_acc) + EPS)**0.5 + std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[ - self.current_uc_indices] - 
latent_hidden_states = ( - self.current_style_fidelity * y_c + - (1.0 - self.current_style_fidelity) * y_uc) + y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] + latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc else: latent_hidden_states = y_uc output = paddle.concat([latent_hidden_states, image_hidden_states]) @@ -259,26 +258,20 @@ def upsample_2d_forward(self, hidden_states, output_size=None): latent_hidden_states = x[:chunk_num] # uc, c image_hidden_states = x[chunk_num:] # uc, c - image_var, image_mean = var_mean( - image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - var, mean = var_mean( - latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) - std = paddle.maximum(var, paddle.zeros_like(var) + EPS)**0.5 + image_var, image_mean = var_mean(image_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + var, mean = var_mean(latent_hidden_states, axis=(2, 3), keepdim=True, unbiased=False) + std = paddle.maximum(var, paddle.zeros_like(var) + EPS) ** 0.5 div_num = image_hidden_states.shape[0] // chunk_num mean_acc = sum(image_mean.split([chunk_num] * div_num)) / div_num var_acc = sum(image_var.split([chunk_num] * div_num)) / div_num - std_acc = paddle.maximum(var_acc, - paddle.zeros_like(var_acc) + EPS)**0.5 + std_acc = paddle.maximum(var_acc, paddle.zeros_like(var_acc) + EPS) ** 0.5 y_uc = (((latent_hidden_states - mean) / std) * std_acc) + mean_acc if do_classifier_free_guidance and self.current_style_fidelity > 1e-5: y_c = y_uc.clone() - y_c[self.current_uc_indices] = latent_hidden_states[ - self.current_uc_indices] - latent_hidden_states = ( - self.current_style_fidelity * y_c + - (1.0 - self.current_style_fidelity) * y_uc) + y_c[self.current_uc_indices] = latent_hidden_states[self.current_uc_indices] + latent_hidden_states = self.current_style_fidelity * y_c + (1.0 - self.current_style_fidelity) * y_uc else: latent_hidden_states = y_uc output = paddle.concat([latent_hidden_states, image_hidden_states]) @@ -316,26 +309,16 @@ def preprocess(image, resize_mode, width, height): if isinstance(image, paddle.Tensor): return image elif isinstance(image, PIL.Image.Image): - image = resize_image( - resize_mode=resize_mode, im=image, width=width, height=height) + image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height) image = [image] if isinstance(image[0], PIL.Image.Image): - image = [ - resize_image( - resize_mode=resize_mode, im=im, width=width, height=height) - for im in image - ] + image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image] w, h = image[0].size - w, h = map(lambda x: x - x % 8, - (w, h)) # resize to integer multiple of 8 + w, h = map(lambda x: x - x % 8, (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -382,8 +365,7 @@ def resize(im, w, h): resized = resize(im, src_w, src_h) res = Image.new("RGB", (width, height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) else: ratio = width / height @@ -394,31 +376,22 @@ def resize(im, w, h): resized = resize(im, src_w, 
src_h) res = Image.new("RGB", (width, height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) if ratio < src_ratio: fill_height = height // 2 - src_h // 2 + res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0)) res.paste( - resized.resize( - (width, fill_height), box=(0, 0, width, 0)), - box=(0, 0)) - res.paste( - resized.resize( - (width, fill_height), - box=(0, resized.height, width, resized.height)), - box=(0, fill_height + src_h), ) + resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)), + box=(0, fill_height + src_h), + ) elif ratio > src_ratio: fill_width = width // 2 - src_w // 2 + res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0)) res.paste( - resized.resize( - (fill_width, height), box=(0, 0, 0, height)), - box=(0, 0)) - res.paste( - resized.resize( - (fill_width, height), - box=(resized.width, 0, resized.width, height)), - box=(fill_width + src_w, 0), ) + resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)), + box=(fill_width + src_w, 0), + ) return res @@ -454,37 +427,33 @@ class ReferenceOnlyPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -492,11 +461,7 @@ def __init__( " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -517,12 +482,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -533,12 +496,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -550,21 +510,23 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) self.attn_modules = None self.gn_modules = None def set_reference_only( - self, - attention_auto_machine_weight=1.0, - gn_auto_machine_weight=1.0, - current_style_fidelity=0.5, - enable_attn=True, - enable_gn=True, - do_classifier_free_guidance=True, ): + self, + attention_auto_machine_weight=1.0, + gn_auto_machine_weight=1.0, + current_style_fidelity=0.5, + enable_attn=True, + enable_gn=True, + do_classifier_free_guidance=True, + ): assert 0.0 <= attention_auto_machine_weight <= 1.0 assert 0.0 <= gn_auto_machine_weight <= 2.0 assert 0.0 <= current_style_fidelity <= 1.0 @@ -574,18 +536,14 @@ def set_reference_only( module.enable_attn = enable_attn module.attention_auto_machine_weight = attention_auto_machine_weight module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] if self.gn_modules is not None: for module in self.gn_modules: module.enable_gn = enable_gn module.gn_auto_machine_weight = gn_auto_machine_weight 
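# The group-norm style hooks wired up here (transformer_2d_model_forward,
# resnet_block_2d_forward, upsample_2d_forward) all apply the same AdaIN-style
# update: latent features are re-normalised to the reference image's channel
# statistics, and `current_style_fidelity` blends the guided and unguided
# results. A compact numpy sketch of that update (single reference batch instead
# of averaged chunks; shapes and EPS purely illustrative):
import numpy as np

EPS = 1e-6


def reference_adain(latent, reference, style_fidelity, uc_indices):
    mean = latent.mean(axis=(2, 3), keepdims=True)
    std = np.maximum(latent.var(axis=(2, 3), keepdims=True), EPS) ** 0.5
    ref_mean = reference.mean(axis=(2, 3), keepdims=True)
    ref_std = np.maximum(reference.var(axis=(2, 3), keepdims=True), EPS) ** 0.5
    y_uc = (latent - mean) / std * ref_std + ref_mean  # match the reference statistics
    y_c = y_uc.copy()
    y_c[uc_indices] = latent[uc_indices]  # keep unconditional rows untouched
    return style_fidelity * y_c + (1.0 - style_fidelity) * y_uc


out = reference_adain(
    np.random.randn(2, 4, 8, 8), np.random.randn(2, 4, 8, 8), style_fidelity=0.5, uc_indices=[0]
)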
module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] # init attn_modules if self.attn_modules is None: @@ -599,75 +557,54 @@ def set_reference_only( hidden_size = self.unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list( - reversed(self.unet.config.block_out_channels))[block_id] + hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = self.unet.config.block_out_channels[block_id] self_attn_processors_keys.append([name, hidden_size]) # sorted by (-hidden_size, name),down -> mid -> up. - for i, (name, _) in enumerate( - sorted( - self_attn_processors_keys, - key=lambda x: (-x[1], x[0]))): + for i, (name, _) in enumerate(sorted(self_attn_processors_keys, key=lambda x: (-x[1], x[0]))): module = self.unet.get_sublayer(name) - module.attn_weight = float(i) / float( - len(self_attn_processors_keys)) + module.attn_weight = float(i) / float(len(self_attn_processors_keys)) module.enable_attn = enable_attn module.attention_auto_machine_weight = attention_auto_machine_weight module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] attn_modules.append(module) self.attn_modules = attn_modules # init gn_modules if self.gn_modules is None: - gn_modules = [self.unet.mid_block.attentions[-1], ] - self.unet.mid_block.attentions[ - -1].gn_weight = 0.0 # mid 0.0 + gn_modules = [ + self.unet.mid_block.attentions[-1], + ] + self.unet.mid_block.attentions[-1].gn_weight = 0.0 # mid 0.0 input_block_names = [ - ("down_blocks.1.resnets.0", - "down_blocks.1.attentions.0"), # 4 2.0 - ("down_blocks.1.resnets.1", - "down_blocks.1.attentions.1"), # 5 1.66 - ("down_blocks.2.resnets.0", - "down_blocks.2.attentions.0"), # 7 1.33 - ("down_blocks.2.resnets.1", - "down_blocks.2.attentions.1"), # 8 1.0 - ("down_blocks.3.resnets.0", - ), # 10 0.66 - ("down_blocks.3.resnets.1", - ), # 11 0.33 + ("down_blocks.1.resnets.0", "down_blocks.1.attentions.0"), # 4 2.0 + ("down_blocks.1.resnets.1", "down_blocks.1.attentions.1"), # 5 1.66 + ("down_blocks.2.resnets.0", "down_blocks.2.attentions.0"), # 7 1.33 + ("down_blocks.2.resnets.1", "down_blocks.2.attentions.1"), # 8 1.0 + ("down_blocks.3.resnets.0",), # 10 0.66 + ("down_blocks.3.resnets.1",), # 11 0.33 ] for w, block_names in enumerate(input_block_names): module = self.unet.get_sublayer(block_names[-1]) - module.gn_weight = 1.0 - float(w) / float( - len(input_block_names)) + module.gn_weight = 1.0 - float(w) / float(len(input_block_names)) gn_modules.append(module) output_block_names = [ - ("up_blocks.0.resnets.0", - ), # 0 0.0 - ("up_blocks.0.resnets.1", - ), # 1 0.25 - ("up_blocks.0.resnets.2", - "up_blocks.0.upsamplers.0"), # 2 0.5 - ("up_blocks.1.resnets.0", - "up_blocks.1.attentions.0"), # 3 0.75 - ("up_blocks.1.resnets.1", - "up_blocks.1.attentions.1"), # 4 1.0 - ("up_blocks.1.resnets.2", - "up_blocks.1.attentions.2"), # 5 1.25 - ("up_blocks.2.resnets.0", - "up_blocks.2.attentions.0"), # 6 1.5 - ("up_blocks.2.resnets.1", - "up_blocks.2.attentions.1"), # 7 1.75 + ("up_blocks.0.resnets.0",), # 0 0.0 + ("up_blocks.0.resnets.1",), # 1 0.25 + ("up_blocks.0.resnets.2", "up_blocks.0.upsamplers.0"), 
# 2 0.5 + ("up_blocks.1.resnets.0", "up_blocks.1.attentions.0"), # 3 0.75 + ("up_blocks.1.resnets.1", "up_blocks.1.attentions.1"), # 4 1.0 + ("up_blocks.1.resnets.2", "up_blocks.1.attentions.2"), # 5 1.25 + ("up_blocks.2.resnets.0", "up_blocks.2.attentions.0"), # 6 1.5 + ("up_blocks.2.resnets.1", "up_blocks.2.attentions.1"), # 7 1.75 ] for w, block_names in enumerate(output_block_names): module = self.unet.get_sublayer(block_names[-1]) @@ -679,20 +616,19 @@ def set_reference_only( module.enable_gn = enable_gn module.gn_auto_machine_weight = gn_auto_machine_weight module.current_style_fidelity = current_style_fidelity - module.current_uc_indices = [ - 0 - ] if do_classifier_free_guidance else [] + module.current_uc_indices = [0] if do_classifier_free_guidance else [] self.gn_modules = gn_modules def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -728,29 +664,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -758,8 +696,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -769,14 +706,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be 
the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -786,46 +725,42 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -844,53 +779,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -903,17 +834,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -934,12 +867,13 @@ def prepare_latents( return latents def prepare_image_latents( - self, - image, - batch_size, - dtype, - generator=None, - do_classifier_free_guidance=False, ): + self, + image, + batch_size, + dtype, + generator=None, + do_classifier_free_guidance=False, + ): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -948,8 +882,7 @@ def prepare_image_latents( if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -965,33 +898,32 @@ def prepare_image_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[PIL.Image.Image, List[PIL.Image.Image], - paddle.Tensor]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - # reference - control_name: str="reference_only", # "none", "reference_only", "reference_adain", "reference_adain+attn" - attention_auto_machine_weight: float=1.0, - gn_auto_machine_weight: float=1.0, - current_style_fidelity: float=0.5, - resize_mode: int=-1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = 
None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + # reference + control_name: str = "reference_only", # "none", "reference_only", "reference_adain", "reference_adain+attn" + attention_auto_machine_weight: float = 1.0, + gn_auto_machine_weight: float = 1.0, + current_style_fidelity: float = 0.5, + resize_mode: int = -1, + ): r""" Function invoked when calling the pipeline for generation. @@ -1079,7 +1011,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1101,7 +1034,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -1118,55 +1052,57 @@ def __call__( width, dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. reference_only - enable_attn = ("only" in control_name or "attn" in control_name and - image is not None and attention_auto_machine_weight > 0) - enable_gn = ("adain" in control_name and image is not None and - gn_auto_machine_weight > 0) + enable_attn = ( + "only" in control_name + or "attn" in control_name + and image is not None + and attention_auto_machine_weight > 0 + ) + enable_gn = "adain" in control_name and image is not None and gn_auto_machine_weight > 0 self.set_reference_only( attention_auto_machine_weight, gn_auto_machine_weight, current_style_fidelity, enable_attn, enable_gn, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) if enable_attn or enable_gn: image = preprocess(image, resize_mode, width, height) image_latents = self.prepare_image_latents( - image, batch_size, dtype, generator, - do_classifier_free_guidance) + image, batch_size, dtype, generator, do_classifier_free_guidance + ) prompt_embeds = prompt_embeds.tile([1 + image.shape[0], 1, 1]) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if enable_attn or enable_gn: - image_noise = randn_tensor( - image_latents.shape, generator=generator, dtype=dtype) + image_noise = randn_tensor(image_latents.shape, generator=generator, dtype=dtype) image_latent_model_input = self.scheduler.scale_model_input( - self.scheduler.add_noise(image_latents, image_noise, t), - t) + self.scheduler.add_noise(image_latents, image_noise, t), t + ) chunk_num = 2 if do_classifier_free_guidance else 1 noise_pred = self.unet( - paddle.concat([ - latent_model_input, - image_latent_model_input.cast( - latent_model_input.dtype), - ]), + paddle.concat( + [ + latent_model_input, + image_latent_model_input.cast(latent_model_input.dtype), + ] + ), t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, @@ -1176,22 +1112,19 @@ def __call__( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1204,8 +1137,7 @@ def __call__( image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 11. Convert to PIL image = self.numpy_to_pil(image) @@ -1214,11 +1146,9 @@ def __call__( image = self.decode_latents(latents) # 10. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py b/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py index 5d0dc0e26b395..25e821228b061 100644 --- a/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py +++ b/ppdiffusers/examples/community/stable_diffusion_controlnet_img2img.py @@ -19,23 +19,27 @@ import numpy as np import paddle import PIL.Image -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ppdiffusers.image_processor import VaeImageProcessor from ppdiffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin -from ppdiffusers.models import (AutoencoderKL, ControlNetModel, - UNet2DConditionModel) +from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import \ - MultiControlNetModel -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import ( + MultiControlNetModel, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (check_min_version, deprecate, logging, - randn_tensor, replace_example_docstring) +from ppdiffusers.utils import ( + check_min_version, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) check_min_version("0.16.1") @@ -88,8 +92,7 @@ """ -class StableDiffusionControlNetImg2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. 
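As background for the image-to-image behaviour of this ControlNet pipeline: its get_timesteps helper (reformatted further down in this file) keeps only the tail of the noise schedule, so `strength` controls how much of the reference image survives denoising. A standalone sketch of that bookkeeping, with illustrative numbers (names are hypothetical, not part of the patch):

    # Sketch of the strength -> timesteps bookkeeping used by img2img-style pipelines.
    def get_timesteps_sketch(num_inference_steps, strength, timesteps, order=1):
        # e.g. 50 steps at strength 0.8 -> init_timestep = 40, t_start = 10:
        # the first 10 steps are skipped and denoising starts from a partially noised image.
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        return timesteps[t_start * order:], num_inference_steps - t_start

With strength=1.0 the whole schedule runs (generation guided only by the prompt and control image); with a small strength only the last few steps run, so the output stays close to the input image.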
@@ -126,17 +129,22 @@ class StableDiffusionControlNetImg2ImgPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ - ControlNetModel], MultiControlNetModel, ], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ + ControlNetModel, + List[ControlNetModel], + Tuple[ControlNetModel], + MultiControlNetModel, + ], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -166,25 +174,27 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, - do_normalize=False, ) + do_normalize=False, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
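The _encode_prompt methods being reformatted in this file all follow the same recipe: tokenize the prompt, run the CLIP text encoder, tile the embeddings per generated image, and, under classifier-free guidance, stack the negative and positive embeddings into a single batch so the UNet runs once per step. A condensed sketch, assuming already-loaded `tokenizer` and `text_encoder` objects (as in the pipelines above) and a single prompt; the function name is illustrative only:

    import paddle

    def encode_prompt_sketch(tokenizer, text_encoder, prompt, negative_prompt="", num_images_per_prompt=1):
        def embed(text):
            ids = tokenizer([text], padding="max_length",
                            max_length=tokenizer.model_max_length,
                            truncation=True, return_tensors="pd").input_ids
            emb = text_encoder(ids)[0]                      # [1, seq_len, dim]
            return emb.tile([num_images_per_prompt, 1, 1])  # one copy per generated image

        cond = embed(prompt)
        uncond = embed(negative_prompt)
        # unconditional first, conditional second: the UNet output is later split with chunk(2) in that order
        return paddle.concat([uncond, cond])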
@@ -231,32 +241,36 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) @@ -264,33 +278,32 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -298,39 +311,38 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -339,16 +351,13 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) return image, has_nsfw_concept def prepare_extra_step_kwargs(self, generator, eta): @@ -357,48 +366,46 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - controlnet_conditioning_scale=1.0, ): + self, + prompt, + image, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -411,7 +418,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # `prompt` needs more sophisticated handling when there are multiple # conditionings. @@ -426,15 +434,12 @@ def check_inputs( self.check_image(image, prompt, prompt_embeds) elif isinstance(self.controlnet, MultiControlNetModel): if not isinstance(image, list): - raise TypeError( - "For multiple controlnets: `image` must be type `list`") + raise TypeError("For multiple controlnets: `image` must be type `list`") # When `image` is a nested list: # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) elif any(isinstance(i, list) for i in image): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." - ) + raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." @@ -448,22 +453,18 @@ def check_inputs( # Check `controlnet_conditioning_scale` if isinstance(self.controlnet, ControlNetModel): if not isinstance(controlnet_conditioning_scale, float): - raise TypeError( - "For single controlnet: `controlnet_conditioning_scale` must be type `float`." - ) + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") elif isinstance(self.controlnet, MultiControlNetModel): if isinstance(controlnet_conditioning_scale, list): - if any( - isinstance(i, list) - for i in controlnet_conditioning_scale): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." 
- ) - elif isinstance(controlnet_conditioning_scale, list) and len( - controlnet_conditioning_scale) != len(self.controlnet.nets): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): raise ValueError( "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets") + " the same length as the number of controlnets" + ) else: assert False @@ -471,16 +472,18 @@ def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, paddle.Tensor) image_is_np = isinstance(image, np.ndarray) - image_is_pil_list = isinstance(image, list) and isinstance( - image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance( - image[0], paddle.Tensor) - image_is_np_list = isinstance(image, list) and isinstance(image[0], - np.ndarray) - - if (not image_is_pil and not image_is_tensor and not image_is_np and - not image_is_pil_list and not image_is_tensor_list and - not image_is_np_list): + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): raise TypeError( f"image must be passed and be one of PIL image, numpy array, paddle tensor, list of PIL images, list of numpy arrays or list of paddle tensors, but is {type(image)}" ) @@ -503,17 +506,17 @@ def check_image(self, image, prompt, prompt_embeds): ) def prepare_control_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, ): - image = self.control_image_processor.preprocess( - image, height=height, width=width).cast(dtype=paddle.float32) + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).cast(dtype=paddle.float32) image_batch_size = image.shape[0] if image_batch_size == 1: @@ -533,21 +536,14 @@ def prepare_control_image( def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is 
{type(image)}" @@ -569,18 +565,15 @@ def prepare_latents(self, elif isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample( - generator[i]) for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: - init_latents = self.vae.encode(image).latent_dist.sample( - generator) + init_latents = self.vae.encode(image).latent_dist.sample(generator) init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -592,12 +585,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) @@ -616,33 +608,44 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray, List[ - paddle.Tensor], List[PIL.Image.Image], List[np.ndarray], ]=None, - control_image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray, - List[paddle.Tensor], List[ - PIL.Image.Image], List[np.ndarray], ]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_conditioning_scale: Union[float, List[float]]=0.8, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[ + paddle.Tensor, + PIL.Image.Image, + np.ndarray, + List[paddle.Tensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + control_image: Union[ + paddle.Tensor, + PIL.Image.Image, + np.ndarray, + List[paddle.Tensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: 
Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.8, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -740,7 +743,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - controlnet_conditioning_scale, ) + controlnet_conditioning_scale, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -757,20 +761,20 @@ def __call__( controlnet = self.controlnet - if isinstance(controlnet, MultiControlNetModel) and isinstance( - controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale - ] * len(controlnet.nets) + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) global_pool_conditions = ( controlnet.config.global_pool_conditions - if isinstance(controlnet, ControlNetModel) else - controlnet.nets[0].config.global_pool_conditions) + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -778,10 +782,10 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - lora_scale=text_encoder_lora_scale, ) + lora_scale=text_encoder_lora_scale, + ) # 4. Prepare image - image = self.image_processor.preprocess(image).cast( - dtype=paddle.float32) + image = self.image_processor.preprocess(image).cast(dtype=paddle.float32) # 5. Prepare controlnet_conditioning_image if isinstance(controlnet, ControlNetModel): @@ -793,7 +797,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) elif isinstance(controlnet, MultiControlNetModel): control_images = [] @@ -806,7 +811,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) control_images.append(control_image_) @@ -815,11 +821,11 @@ def __call__( assert False # 5. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, ) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + self.scheduler.set_timesteps( + num_inference_steps, + ) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. 
Prepare latent variables latents = self.prepare_latents( @@ -828,28 +834,25 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # controlnet(s) inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. control_model_input = latents - control_model_input = self.scheduler.scale_model_input( - control_model_input, t) + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: control_model_input = latent_model_input @@ -862,20 +865,17 @@ def __call__( controlnet_cond=control_image, conditioning_scale=controlnet_conditioning_scale, guess_mode=guess_mode, - return_dict=False, ) + return_dict=False, + ) if guess_mode and do_classifier_free_guidance: # Infered ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
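# A sketch of why the zero-padding below works (not part of the patch): with classifier-free guidance the
# UNet batch is ordered [uncond, cond] (2B samples), but in guess mode ControlNet only saw the conditional
# half (B samples). Its residuals are *added* to the UNet's feature maps, so padding the unconditional half
# with zeros leaves that branch untouched while the conditional half still gets full ControlNet guidance:
#
#     B, C, H, W = 1, 320, 64, 64                    # illustrative shapes only
#     d = paddle.randn([B, C, H, W])                 # residual computed for the conditional half
#     d = paddle.concat([paddle.zeros_like(d), d])   # -> [2B, C, H, W]; the uncond half receives +0
#
# After the UNet call the halves are split again (noise_pred.chunk(2)) and recombined as
# uncond + guidance_scale * (cond - uncond).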
- down_block_res_samples = [ - paddle.concat([paddle.zeros_like(d), d]) - for d in down_block_res_samples - ] - mid_block_res_sample = paddle.concat([ - paddle.zeros_like(mid_block_res_sample), - mid_block_res_sample - ]) + down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = paddle.concat( + [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample] + ) # predict the noise residual noise_pred = self.unet( @@ -885,35 +885,26 @@ def __call__( cross_attention_kwargs=cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self.vae.decode( - latents / self.vae.config.scaling_factor, return_dict=False)[0] - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) else: image = latents has_nsfw_concept = None @@ -923,11 +914,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py b/ppdiffusers/examples/community/stable_diffusion_hires_fix.py index 73eae51ab8e43..420f7c4ee7053 100644 --- a/ppdiffusers/examples/community/stable_diffusion_hires_fix.py +++ b/ppdiffusers/examples/community/stable_diffusion_hires_fix.py @@ -19,18 +19,21 @@ import paddle from packaging import version -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel from ppdiffusers.configuration_utils import FrozenDict -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion import 
StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ppdiffusers.utils import ( + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) logger = logging.get_logger(__name__) @@ -80,37 +83,33 @@ class StableDiffusionHiresFixPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -118,11 +117,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -143,12 +138,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -159,12 +152,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -176,18 +166,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
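A quick worked example for the vae_scale_factor line above: the standard Stable Diffusion VAE has four entries in block_out_channels, so the factor comes out to 8, which is exactly why check_inputs requires height and width to be divisible by 8 (latents live at height/8 x width/8). Assuming a typical SD 1.x VAE config:

    block_out_channels = [128, 256, 512, 512]               # typical SD VAE config (assumed here)
    vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # -> 8
    height, width = 512, 512
    latent_height, latent_width = height // vae_scale_factor, width // vae_scale_factor  # 64 x 64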
@@ -223,29 +215,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -253,8 +247,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -264,14 +257,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -281,36 +276,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -319,7 +311,7 @@ def get_timesteps(self, denoising_steps, denoising_strength): self.scheduler.set_timesteps(steps) t_start = max(steps - denoising_steps, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] if hasattr(self.scheduler, "step_index_offset"): self.scheduler.step_index_offset = t_start * self.scheduler.order @@ -328,11 +320,10 @@ def get_timesteps(self, denoising_steps, denoising_strength): def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -351,62 +342,57 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return 
extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + hr_scale, + hr_resize_height, + hr_resize_width, + denoising_strength, + latent_scale_mode, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if hr_scale < 0: - raise ValueError( - "hr_scale shoule be greater that 0, but acceived {hr_scale}") + raise ValueError(f"hr_scale should be greater than 0, but received {hr_scale}") if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: raise ValueError( @@ -414,9 +400,7 @@ def check_inputs( ) if denoising_strength > 1 or denoising_strength < 0: - raise ValueError( - f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}" - ) + raise ValueError(f"denoising_strength should be set between 0 and 1, but received {denoising_strength}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -434,17 +418,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}."
+ ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -464,12 +450,7 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - def get_upscaled_width_and_height(self, - width, - height, - hr_scale=2, - hr_resize_width=0, - hr_resize_height=0): + def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): if hr_resize_width == 0 and hr_resize_height == 0: hr_upscale_to_width = int(width * hr_scale) hr_upscale_to_height = int(height * hr_scale) @@ -496,32 +477,32 @@ def get_upscaled_width_and_height(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=40, - hires_ratio: Optional[float]=0.5, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - enable_hr: Optional[bool]=True, - hr_scale: Optional[float]=2.0, - hr_resize_width: Optional[int]=0, - hr_resize_height: Optional[int]=0, - denoising_strength: Optional[float]=0.7, - latent_scale_mode: Optional[str]="nearest", ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 40, + hires_ratio: Optional[float] = 0.5, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + enable_hr: Optional[bool] = True, + hr_scale: Optional[float] = 2.0, + hr_resize_width: Optional[int] = 0, + hr_resize_height: Optional[int] = 0, + denoising_strength: Optional[float] = 0.7, + latent_scale_mode: Optional[str] = "nearest", + ): r""" Function invoked when calling the pipeline for generation. @@ -622,7 +603,8 @@ def __call__( latent_scale_mode, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -644,7 +626,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps if enable_hr: @@ -660,11 +643,9 @@ def __call__( # 5. 
Prepare latent variables if generator is None: generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - generator_state) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - paddle.Generator().states_[generator]) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) num_channels_latents = self.unet.in_channels latents = self.prepare_latents( @@ -674,7 +655,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -684,32 +666,27 @@ def __call__( with self.progress_bar(total=sample_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -719,85 +696,74 @@ def __call__( # 8. determine the upscaled width and height for upscaled images truncate_width = 0 truncate_height = 0 - ( - self.hr_upscale_to_width, - self.hr_upscale_to_height, - ) = self.get_upscaled_width_and_height( + (self.hr_upscale_to_width, self.hr_upscale_to_height,) = self.get_upscaled_width_and_height( width, height, hr_scale=hr_scale, hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, ) + hr_resize_height=hr_resize_height, + ) if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (self.hr_upscale_to_width - hr_resize_width - ) // self.vae_scale_factor - truncate_height = (self.hr_upscale_to_height - hr_resize_height - ) // self.vae_scale_factor + truncate_width = (self.hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor + truncate_height = (self.hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor # 9. 
special case: do nothing if upscaling is not nesscessary - if (self.hr_upscale_to_width == width and - self.hr_upscale_to_height == height): + if self.hr_upscale_to_width == width and self.hr_upscale_to_height == height: enable_hr = False denoising_strength = None if enable_hr: # 10. prepare init latents - timesteps, hr_steps = self.get_timesteps(hr_steps, - denoising_strength) + timesteps, hr_steps = self.get_timesteps(hr_steps, denoising_strength) init_timestep = timesteps[:1].tile([latents.shape[0]]) latents = paddle.nn.functional.interpolate( latents, size=( self.hr_upscale_to_height // self.vae_scale_factor, - self.hr_upscale_to_width // self.vae_scale_factor, ), - mode=latent_scale_mode, ) - latents = latents[:, :, truncate_height // 2:latents.shape[2] - ( - truncate_height + 1) // 2, truncate_width // 2:latents.shape[3] - - (truncate_width + 1) // 2, ] - - noise = randn_tensor( - latents.shape, - dtype=latents.dtype, - generator="initial_generator") + self.hr_upscale_to_width // self.vae_scale_factor, + ), + mode=latent_scale_mode, + ) + latents = latents[ + :, + :, + truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, + truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, + ] + + noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") latents = self.scheduler.add_noise(latents, noise, init_timestep) # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs( - "initial_generator", eta) + extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) # 12. denoising on hires.fix steps num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order with self.progress_bar(total=hr_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else - latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -808,16 +774,13 @@ def __call__( has_nsfw_concept = None elif output_type == "pil": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, 
prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) image = self.numpy_to_pil(image) else: image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/stable_diffusion_mega.py b/ppdiffusers/examples/community/stable_diffusion_mega.py index ba2adb2a179ec..71ff024d88b08 100644 --- a/ppdiffusers/examples/community/stable_diffusion_mega.py +++ b/ppdiffusers/examples/community/stable_diffusion_mega.py @@ -21,30 +21,44 @@ import paddle import PIL import PIL.Image -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, ControlNetModel, DDIMScheduler, DDPMScheduler, - DEISMultistepScheduler, DiffusionPipeline, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, HeunDiscreteScheduler, - KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel, - UniPCMultistepScheduler) + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, + UniPCMultistepScheduler, +) from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput +from ppdiffusers.loaders import ( + FromCkptMixin, + LoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from ppdiffusers.pipelines.stable_diffusion.pipeline_cycle_diffusion import ( - compute_noise, posterior_sample) -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker + compute_noise, + posterior_sample, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (PIL_INTERPOLATION, deprecate, logging, - randn_tensor) +from ppdiffusers.utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -66,7 +80,8 @@ [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -185,32 +200,20 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. 
Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. """ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -218,8 +221,7 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] @@ -228,10 +230,11 @@ def pad_tokens_and_weights(tokens, def get_unweighted_text_embeddings( - pipe, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, ): + pipe, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. @@ -241,8 +244,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -268,14 +270,15 @@ def get_unweighted_text_embeddings( def get_weighted_text_embeddings( - pipe, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + pipe, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -299,24 +302,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. 
""" - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -324,33 +322,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( prompt_tokens, @@ -360,7 +351,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( @@ -371,7 +363,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) # get the embeddings @@ -379,43 +372,35 @@ def get_weighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, 
- no_boseos_middle=no_boseos_middle, ) - prompt_weights = paddle.to_tensor( - prompt_weights, dtype=text_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + prompt_weights = paddle.to_tensor(prompt_weights, dtype=text_embeddings.dtype) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - uncond_weights = paddle.to_tensor( - uncond_weights, dtype=uncond_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + uncond_weights = paddle.to_tensor(uncond_weights, dtype=uncond_embeddings.dtype) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= ( - (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1) - .unsqueeze(-1)) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= ( - (previous_mean / uncond_embeddings.mean(axis=[-2, -1])) - .unsqueeze(-1).unsqueeze(-1)) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: return text_embeddings, uncond_embeddings return text_embeddings, None -def prepare_mask_and_masked_image(image, - mask, - height=None, - width=None, - return_image: bool=False): +def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): """ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. 
This means that those inputs will be converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the @@ -452,14 +437,11 @@ def prepare_mask_and_masked_image(image, if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -476,12 +458,9 @@ def prepare_mask_and_masked_image(image, else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -498,8 +477,7 @@ def prepare_mask_and_masked_image(image, # Image as float32 image = image.cast(dtype=paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -510,13 +488,8 @@ def prepare_mask_and_masked_image(image, w, h = image[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - image = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) - for i in image - ] + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] image = [np.array(i.convert("RGB"))[None, :] for i in image] image = np.concatenate(image, axis=0) elif isinstance(image, list) and isinstance(image[0], np.ndarray): @@ -535,14 +508,9 @@ def prepare_mask_and_masked_image(image, w, h = mask[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - mask = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask - ] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -563,55 +531,45 @@ def prepare_mask_and_masked_image(image, class CommonMixIn: @property def components(self) -> Dict[str, Any]: - return { - k: getattr(self, k) - for k in self.config.keys() if not k.startswith("_") - } + return {k: getattr(self, k) for k in self.config.keys() if 
not k.startswith("_")} def change_scheduler(self, scheduler_type="ddim"): scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, - ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError( f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" 
@@ -623,11 +581,10 @@ def get_timesteps(self, num_inference_steps, strength=1.0): return self.scheduler.timesteps, num_inference_steps # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] num_inference_steps = num_inference_steps - t_start # check that number of inference steps is not < 1 - as this doesn't make sense @@ -640,26 +597,26 @@ def get_timesteps(self, num_inference_steps, strength=1.0): return timesteps, num_inference_steps def prepare_controlnet_cond( - self, - controlnet_cond, - controlnet_conditioning_scale, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, ): + self, + controlnet_cond, + controlnet_conditioning_scale, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): control_image = self.control_image_processor.preprocess( controlnet_cond, height=height, - width=width, ) + width=width, + ) if isinstance(controlnet_conditioning_scale, (float, int)): - controlnet_conditioning_scale = paddle.to_tensor( - [controlnet_conditioning_scale] * 13, dtype=dtype) + controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=dtype) elif isinstance(controlnet_conditioning_scale, (list, tuple)): - controlnet_conditioning_scale = paddle.to_tensor( - controlnet_conditioning_scale, dtype=dtype) + controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=dtype) else: raise ValueError( f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" @@ -678,40 +635,40 @@ def prepare_controlnet_cond( return control_image, controlnet_conditioning_scale def check_inputs( - self, - prompt, - height=512, - width=512, - callback_steps=1, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - strength=1.0, ): + self, + prompt, + height=512, + width=512, + callback_steps=1, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + strength=1.0, + ): if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: raise ValueError( f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -724,24 +681,25 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") def prepare_latents( - self, - batch_size, - height, - width, - generator, - dtype=None, - latents=None, - image=None, - timestep=None, - is_strength_max=True, - return_noise=False, - return_image_latents=False, ): + self, + batch_size, + height, + width, + generator, + dtype=None, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + ): shape = [ batch_size, self.vae.config.latent_channels, @@ -762,53 +720,50 @@ def prepare_latents( if return_image_latents or (latents is None and not is_strength_max): image = image.cast(dtype=dtype) - image_latents = self._encode_vae_image( - image, batch_size=batch_size, generator=generator) + image_latents = self._encode_vae_image(image, batch_size=batch_size, generator=generator) if latents is None: noise = randn_tensor(shape, generator=generator, dtype=dtype) # if strength is 1.
then initialise the latents to noise, else initial to image + noise - latents = (noise if is_strength_max else - self.scheduler.add_noise(image_latents, noise, timestep)) + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) # if pure noise then scale the initial latents by the Scheduler's init sigma - latents = (latents * self.scheduler.init_noise_sigma - if is_strength_max else latents) + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: noise = latents if str(noise.dtype).replace("paddle.", "") != dtype: noise = noise.cast(dtype) latents = noise * self.scheduler.init_noise_sigma - outputs = (latents, ) + outputs = (latents,) if return_noise: - outputs += (noise, ) + outputs += (noise,) if return_image_latents: - outputs += (image_latents, ) + outputs += (image_latents,) if len(outputs) == 1: outputs = latents return outputs def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - generator, - dtype, - do_classifier_free_guidance=False, - return_masked_image_latents=True, ): + self, + mask, + masked_image, + batch_size, + height, + width, + generator, + dtype, + do_classifier_free_guidance=False, + return_masked_image_latents=True, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision mask = paddle.nn.functional.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) mask = mask.cast(dtype=dtype) # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -821,14 +776,12 @@ def prepare_mask_latents( ) mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask if not return_masked_image_latents: return mask masked_image = masked_image.cast(dtype=dtype) - masked_image_latents = self._encode_vae_image( - masked_image, batch_size=batch_size, generator=generator) + masked_image_latents = self._encode_vae_image(masked_image, batch_size=batch_size, generator=generator) if masked_image_latents.shape[0] < batch_size: if not batch_size % masked_image_latents.shape[0] == 0: raise ValueError( @@ -836,31 +789,24 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype=dtype) return mask, masked_image_latents def is_scheduler_support_step_index(self): - kwargs_keys = set( - inspect.signature(self.scheduler.step).parameters.keys()) + kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) return "kwargs" in kwargs_keys or "step_index" in kwargs_keys - def _encode_vae_image(self, - image: paddle.Tensor, - batch_size=1, - generator=None, - **kwargs): + def _encode_vae_image(self, image: paddle.Tensor, batch_size=1, generator=None, **kwargs): if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -868,21 +814,24 @@ def _encode_vae_image(self, return self.vae.config.scaling_factor * init_latents def _decode_vae_latents(self, latents: paddle.Tensor, **kwargs): - images_vae = self.vae.decode(latents, )[0] + images_vae = self.vae.decode( + latents, + )[0] return images_vae def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): if parse_prompt_type == "lpw": return self._encode_prompt_lpw( prompt, @@ -893,7 +842,8 @@ def _encode_prompt( negative_prompt_embeds=negative_prompt_embeds, lora_scale=lora_scale, max_embeddings_multiples=max_embeddings_multiples, - **kwargs, ) + **kwargs, + ) elif parse_prompt_type == "raw": return self._encode_prompt_raw( prompt, @@ -902,22 +852,23 @@ def _encode_prompt( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - lora_scale=lora_scale, ) + lora_scale=lora_scale, + ) elif parse_prompt_type == "webui": - raise NotImplementedError( - "`parse_prompt_type=webui` is not implemented yet.") + raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") def _encode_prompt_lpw( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt: 
Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -953,66 +904,63 @@ def _encode_prompt_lpw( if do_classifier_free_guidance: if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) prompt_embeds, negative_prompt_embeds = get_weighted_text_embeddings( pipe=self, prompt=prompt, uncond_prompt=uncond_tokens, max_embeddings_multiples=max_embeddings_multiples, - **kwargs, ) + **kwargs, + ) prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def _encode_prompt_raw( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - lora_scale: Optional[float]=None, - **kwargs, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -1059,32 +1007,36 @@ def _encode_prompt_raw( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) @@ -1092,33 +1044,32 @@ def _encode_prompt_raw( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, 
but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -1126,39 +1077,38 @@ def _encode_prompt_raw( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -1167,16 +1117,13 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) return image, has_nsfw_concept def prepare_extra_step_kwargs(self, generator, eta): @@ -1185,26 +1132,25 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs class StableDiffusionMegaPipeline( - DiffusionPipeline, - CommonMixIn, - FromCkptMixin, - LoraLoaderMixin, - TextualInversionLoaderMixin, ): + DiffusionPipeline, + CommonMixIn, + FromCkptMixin, + LoraLoaderMixin, + TextualInversionLoaderMixin, +): r""" Pipeline for mega using Stable Diffusion. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the @@ -1239,37 +1185,33 @@ def __call__(self, *args, **kwargs): return self.text2img(*args, **kwargs) def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: ControlNetModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: ControlNetModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. 
Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -1277,11 +1219,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -1310,15 +1248,16 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, - do_normalize=False, ) + do_normalize=False, + ) self.supported_scheduler = [ "pndm", "lms", @@ -1340,19 +1279,20 @@ def __init__( @paddle.no_grad() def do_unet( - self, - do_controlnet, - latents, - latent_model_input, - t, - i, - prompt_embeds, - control_image, - control_conditioning_scale, - cross_attention_kwargs, - guess_mode, - do_classifier_free_guidance, - is_scheduler_support_step_index=False, ): + self, + do_controlnet, + latents, + latent_model_input, + t, + i, + prompt_embeds, + control_image, + control_conditioning_scale, + cross_attention_kwargs, + guess_mode, + do_classifier_free_guidance, + is_scheduler_support_step_index=False, + ): if not do_controlnet: # predict the noise residual noise_pred_unet = self.unet( @@ -1360,18 +1300,17 @@ def do_unet( timestep=t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] else: # controlnet inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. 
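# In guess mode with classifier-free guidance the ControlNet only sees the conditional half of
# the batch: the raw latents (not the CFG-expanded latent_model_input) are re-scaled for this
# timestep, and prompt_embeds.chunk(2)[1] selects the text-conditioned embeddings; the resulting
# residuals are zero-padded for the unconditional half further below.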
control_model_input = latents if is_scheduler_support_step_index: - control_model_input = self.scheduler.scale_model_input( - control_model_input, t, step_index=i) + control_model_input = self.scheduler.scale_model_input(control_model_input, t, step_index=i) else: - control_model_input = self.scheduler.scale_model_input( - control_model_input, t) + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: control_model_input = latent_model_input @@ -1384,20 +1323,15 @@ def do_unet( controlnet_cond=control_image, conditioning_scale=control_conditioning_scale, guess_mode=guess_mode, - return_dict=False, ) + return_dict=False, + ) if guess_mode and do_classifier_free_guidance: # Infered ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. - down_block_res_samples = [ - paddle.concat([paddle.zeros_like(d), d]) - for d in down_block_res_samples - ] - mid_block_res_sample = paddle.concat([ - paddle.zeros_like(mid_block_res_sample), - mid_block_res_sample - ]) + down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = paddle.concat([paddle.zeros_like(mid_block_res_sample), mid_block_res_sample]) # predict the noise residual noise_pred_unet = self.unet( @@ -1407,35 +1341,36 @@ def do_unet( cross_attention_kwargs=cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, mid_block_additional_residual=mid_block_res_sample, - return_dict=False, )[0] + return_dict=False, + )[0] return noise_pred_unet @paddle.no_grad() def text2img( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + 
controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -1535,7 +1470,8 @@ def text2img( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1551,12 +1487,13 @@ def text2img( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -1566,7 +1503,8 @@ def text2img( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -1583,7 +1521,8 @@ def text2img( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None @@ -1598,27 +1537,24 @@ def text2img( width, generator=generator, dtype=dtype, - latents=latents, ) + latents=latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -1637,10 +1573,8 @@ def text2img( # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 @@ -1651,22 +1585,19 @@ def text2img( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -1677,43 +1608,41 @@ def text2img( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def img2img( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - 
paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -1828,10 +1757,10 @@ def img2img( controlnet_conditioning_scale=controlnet_conditioning_scale, guess_mode=guess_mode, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) # 0. Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs. Raise error if not correct @@ -1843,7 +1772,8 @@ def img2img( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1857,12 +1787,13 @@ def img2img( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. 
Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -1872,7 +1803,8 @@ def img2img( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -1889,19 +1821,18 @@ def img2img( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare latent variables # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 latents = self.prepare_latents( @@ -1913,21 +1844,19 @@ def img2img( latents=latents, image=init_image, timestep=latent_timestep, - is_strength_max=is_strength_max, ) + is_strength_max=is_strength_max, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -1940,35 +1869,26 @@ def img2img( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -1979,45 +1899,43 @@ def img2img( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def inpaint_legacy( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: 
Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -2122,7 +2040,8 @@ def inpaint_legacy( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -2134,7 +2053,8 @@ def inpaint_legacy( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -2149,12 +2069,13 @@ def inpaint_legacy( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -2164,7 +2085,8 @@ def inpaint_legacy( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -2181,18 +2103,17 @@ def inpaint_legacy( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 
50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -2208,7 +2129,8 @@ def inpaint_legacy( timestep=latent_timestep, is_strength_max=is_strength_max, return_noise=True, - return_image_latents=True, ) + return_image_latents=True, + ) # 6. Prepare mask latent variables mask = self.prepare_mask_latents( @@ -2220,26 +2142,24 @@ def inpaint_legacy( dtype=dtype, generator=generator, do_classifier_free_guidance=do_classifier_free_guidance, - return_masked_image_latents=False, ) + return_masked_image_latents=False, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -2252,51 +2172,39 @@ def inpaint_legacy( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > 
num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -2307,45 +2215,43 @@ def inpaint_legacy( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def inpaint( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. 
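Both inpainting entry points share the same denoising loop but prepare the UNet input differently: `inpaint` can drive a 9-channel inpainting UNet by concatenating latents, mask and masked-image latents on the channel axis, while the legacy 4-channel path re-imposes the known region after every scheduler step. A minimal sketch of the two strategies, using illustrative helper names (the actual tensors come from `prepare_latents` and `prepare_mask_latents`, and the channel count is checked against `self.unet.config.in_channels`):

import paddle

def build_unet_input(latent_model_input, mask, masked_image_latents, num_channels_unet):
    # 9-channel inpainting UNet: mask and masked-image latents ride along in the channel axis,
    # mirroring the paddle.concat([...], axis=1) done inside the denoising loop below.
    if num_channels_unet == 9:
        return paddle.concat([latent_model_input, mask, masked_image_latents], axis=1)
    # 4-channel (legacy) UNet: the input is left unchanged; known pixels are re-imposed
    # after each scheduler step with the blend below.
    return latent_model_input

def legacy_blend(latents, init_latents_proper, init_mask):
    # keep original content where init_mask == 0, freshly denoised content where init_mask == 1
    return (1 - init_mask) * init_latents_proper + init_mask * latents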
@@ -2452,7 +2358,8 @@ def inpaint( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -2464,7 +2371,8 @@ def inpaint( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -2480,12 +2388,13 @@ def inpaint( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -2495,16 +2404,15 @@ def inpaint( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -2524,7 +2432,8 @@ def inpaint( timestep=latent_timestep, is_strength_max=is_strength_max, return_noise=True, - return_image_latents=return_image_latents, ) + return_image_latents=return_image_latents, + ) if return_image_latents: latents, noise, image_latents = latents_outputs @@ -2541,29 +2450,27 @@ def inpaint( dtype=dtype, generator=generator, do_classifier_free_guidance=do_classifier_free_guidance, - return_masked_image_latents=True, ) + return_masked_image_latents=True, + ) # 7. Check that sizes of mask, masked image and latents match if num_channels_unet == 9: # default case for runwayml/stable-diffusion-inpainting num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + - num_channels_masked_image != self.unet.config.in_channels): + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: raise ValueError( f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) elif num_channels_unet != 4: - raise ValueError( - f"The unet should have either 4 or 9 input channels, not {num_channels_unet}." 
- ) + raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.") # do_controlnet - do_controlnet = (controlnet_cond is not None and - self.controlnet is not None and is_legacy) + do_controlnet = controlnet_cond is not None and self.controlnet is not None and is_legacy if not do_controlnet: guess_mode = False if do_controlnet: @@ -2576,7 +2483,8 @@ def inpaint( num_images_per_prompt=num_images_per_prompt, dtype=dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None @@ -2584,26 +2492,21 @@ def inpaint( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if not is_legacy: # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = paddle.concat( - [latent_model_input, mask, masked_image_latents], - axis=1) + latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) noise_pred_unet = self.do_unet( do_controlnet, @@ -2616,51 +2519,39 @@ def inpaint( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] if is_legacy: if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents latents = latents.cast(dtype) # 
call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -2671,57 +2562,54 @@ def inpaint( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def check_inputs_hires_fix( - self, - prompt, - height, - width, - callback_steps, - hr_scale, - hr_resize_height, - hr_resize_width, - denoising_strength, - latent_scale_mode, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + hr_scale, + hr_resize_height, + hr_resize_width, + denoising_strength, + latent_scale_mode, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: raise ValueError( f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if hr_scale < 0: - raise ValueError( - "hr_scale shoule be greater that 0, but acceived {hr_scale}") + raise ValueError("hr_scale shoule be greater that 0, but acceived {hr_scale}") if hr_resize_height % 8 != 0 or hr_resize_width % 8 != 0: raise ValueError( @@ -2729,9 +2617,7 @@ def check_inputs_hires_fix( ) if denoising_strength > 1 or denoising_strength < 0: - raise ValueError( - f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}" - ) + raise ValueError(f"denoising_strength should be set between 0 and 1., but acceived {denoising_strength}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -2749,14 +2635,10 @@ def check_inputs_hires_fix( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") - - def get_upscaled_width_and_height(self, - width, - height, - hr_scale=2, - hr_resize_width=0, - hr_resize_height=0): + f" {negative_prompt_embeds.shape}." + ) + + def get_upscaled_width_and_height(self, width, height, hr_scale=2, hr_resize_width=0, hr_resize_height=0): if hr_resize_width == 0 and hr_resize_height == 0: hr_upscale_to_width = int(width * hr_scale) hr_upscale_to_height = int(height * hr_scale) @@ -2783,42 +2665,42 @@ def get_upscaled_width_and_height(self, def get_hires_fix_timesteps(self, denoising_steps, denoising_strength): steps = int(denoising_steps / min(denoising_strength, 0.999)) self.scheduler.set_timesteps(steps) - timesteps = self.scheduler.timesteps[steps - denoising_steps:] + timesteps = self.scheduler.timesteps[steps - denoising_steps :] return timesteps, denoising_steps @paddle.no_grad() def hires_fix( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=40, - hires_ratio: Optional[float]=0.5, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - enable_hr: Optional[bool]=True, - hr_scale: Optional[float]=2.0, - hr_resize_width: Optional[int]=0, - hr_resize_height: Optional[int]=0, - denoising_strength: Optional[float]=0.7, - latent_scale_mode: Optional[str]="nearest", - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 40, + hires_ratio: 
Optional[float] = 0.5, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + enable_hr: Optional[bool] = True, + hr_scale: Optional[float] = 2.0, + hr_resize_width: Optional[int] = 0, + hr_resize_height: Optional[int] = 0, + denoising_strength: Optional[float] = 0.7, + latent_scale_mode: Optional[str] = "nearest", + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. @@ -2942,7 +2824,8 @@ def hires_fix( latent_scale_mode, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -2958,12 +2841,13 @@ def hires_fix( do_classifier_free_guidance = guidance_scale > 1.0 guess_mode = guess_mode or ( - self.controlnet.config.global_pool_conditions - if self.controlnet is not None else False) + self.controlnet.config.global_pool_conditions if self.controlnet is not None else False + ) # 3. Encode input prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, num_images_per_prompt, @@ -2973,7 +2857,8 @@ def hires_fix( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # do_controlnet @@ -2990,7 +2875,8 @@ def hires_fix( dtype=dtype, num_images_per_prompt=num_images_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None @@ -3009,11 +2895,9 @@ def hires_fix( # 5. Prepare latent variables if generator is None: generator_state = paddle.get_cuda_rng_state() - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - generator_state) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(generator_state) else: - paddle.Generator().states_["initial_generator"] = copy.deepcopy( - paddle.Generator().states_[generator]) + paddle.Generator().states_["initial_generator"] = copy.deepcopy(paddle.Generator().states_[generator]) latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -3021,7 +2905,8 @@ def hires_fix( width, generator=generator, dtype=dtype, - latents=latents, ) + latents=latents, + ) # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -3032,10 +2917,8 @@ def hires_fix( with self.progress_bar(total=sample_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -3048,25 +2931,21 @@ def hires_fix( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -3076,19 +2955,16 @@ def hires_fix( # 8. determine the upscaled width and height for upscaled images truncate_width = 0 truncate_height = 0 - ( - hr_upscale_to_width, - hr_upscale_to_height, ) = self.get_upscaled_width_and_height( - width, - height, - hr_scale=hr_scale, - hr_resize_width=hr_resize_width, - hr_resize_height=hr_resize_height, ) + (hr_upscale_to_width, hr_upscale_to_height,) = self.get_upscaled_width_and_height( + width, + height, + hr_scale=hr_scale, + hr_resize_width=hr_resize_width, + hr_resize_height=hr_resize_height, + ) if hr_resize_width != 0 and hr_resize_height != 0: - truncate_width = (hr_upscale_to_width - hr_resize_width - ) // self.vae_scale_factor - truncate_height = (hr_upscale_to_height - hr_resize_height - ) // self.vae_scale_factor + truncate_width = (hr_upscale_to_width - hr_resize_width) // self.vae_scale_factor + truncate_height = (hr_upscale_to_height - hr_resize_height) // self.vae_scale_factor # 9. 
special case: do nothing if upscaling is not nesscessary if hr_upscale_to_width == width and hr_upscale_to_height == height: @@ -3097,10 +2973,7 @@ def hires_fix( if enable_hr: if do_controlnet: - ( - control_image, - control_conditioning_scale, - ) = self.prepare_controlnet_cond( + (control_image, control_conditioning_scale,) = self.prepare_controlnet_cond( controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, width=hr_upscale_to_width, @@ -3109,45 +2982,43 @@ def hires_fix( num_images_per_prompt=num_images_per_prompt, dtype=dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) else: control_image = None control_conditioning_scale = None # 10. prepare init latents - timesteps, hr_steps = self.get_hires_fix_timesteps( - hr_steps, denoising_strength) + timesteps, hr_steps = self.get_hires_fix_timesteps(hr_steps, denoising_strength) init_timestep = timesteps[:1].tile([latents.shape[0]]) latents = paddle.nn.functional.interpolate( latents, size=( hr_upscale_to_height // self.vae_scale_factor, - hr_upscale_to_width // self.vae_scale_factor, ), - mode=latent_scale_mode, ) - latents = latents[:, :, truncate_height // 2:latents.shape[2] - ( - truncate_height + 1) // 2, truncate_width // 2:latents.shape[3] - - (truncate_width + 1) // 2, ] - - noise = randn_tensor( - latents.shape, - dtype=latents.dtype, - generator="initial_generator") + hr_upscale_to_width // self.vae_scale_factor, + ), + mode=latent_scale_mode, + ) + latents = latents[ + :, + :, + truncate_height // 2 : latents.shape[2] - (truncate_height + 1) // 2, + truncate_width // 2 : latents.shape[3] - (truncate_width + 1) // 2, + ] + + noise = randn_tensor(latents.shape, dtype=latents.dtype, generator="initial_generator") latents = self.scheduler.add_noise(latents, noise, init_timestep) # 11. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs( - "initial_generator", eta) + extra_step_kwargs = self.prepare_extra_step_kwargs("initial_generator", eta) # 12. 
denoising on hires.fix steps num_warmup_steps = len(timesteps) - hr_steps * self.scheduler.order with self.progress_bar(total=hr_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else - latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred_unet = self.do_unet( do_controlnet, @@ -3160,31 +3031,26 @@ def hires_fix( control_conditioning_scale, cross_attention_kwargs, guess_mode, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -3195,42 +3061,40 @@ def hires_fix( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def cycle_diffusion( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - 
cross_attention_kwargs: Optional[Dict[str, Any]]=None, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -3310,8 +3174,7 @@ def cycle_diffusion( """ self.change_scheduler("ddim") # 0. Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs @@ -3323,7 +3186,8 @@ def cycle_diffusion( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -3339,8 +3203,9 @@ def cycle_diffusion( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode target prompt and source prompt - text_encoder_lora_scale = (cross_attention_kwargs.get("scale", None) if - cross_attention_kwargs is not None else None) + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, @@ -3351,24 +3216,24 @@ def cycle_diffusion( negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) source_prompt_embeds = self._encode_prompt( source_prompt, num_images_per_prompt, do_classifier_free_guidance, lora_scale=text_encoder_lora_scale, max_embeddings_multiples=max_embeddings_multiples, - parse_prompt_type=parse_prompt_type, ) + parse_prompt_type=parse_prompt_type, + ) dtype = prompt_embeds.dtype # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 6. Prepare latent variables # at which timestep to set the initial noise (n.b. 
50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) is_strength_max = strength == 1.0 latents, clean_latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -3380,7 +3245,8 @@ def cycle_diffusion( image=init_image, timestep=latent_timestep, is_strength_max=is_strength_max, - return_image_latents=True, ) + return_image_latents=True, + ) source_latents = latents # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -3388,18 +3254,15 @@ def cycle_diffusion( generator = extra_step_kwargs.pop("generator", None) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = paddle.concat([latents] * 2) source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input( - source_latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) # predict the noise residual concat_latent_model_input = paddle.stack( @@ -3409,7 +3272,8 @@ def cycle_diffusion( source_latent_model_input[1], latent_model_input[1], ], - axis=0, ) + axis=0, + ) concat_prompt_embeds = paddle.stack( [ source_prompt_embeds[0], @@ -3417,7 +3281,8 @@ def cycle_diffusion( source_prompt_embeds[1], prompt_embeds[1], ], - axis=0, ) + axis=0, + ) # predict the noise residual concat_noise_pred = self.unet( @@ -3425,19 +3290,20 @@ def cycle_diffusion( timestep=t, encoder_hidden_states=concat_prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance ( source_noise_pred_uncond, noise_pred_uncond, source_noise_pred_text, - noise_pred_text, ) = concat_noise_pred.chunk( - 4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text, + ) = concat_noise_pred.chunk(4, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond) + source_noise_pred_text - source_noise_pred_uncond + ) # Sample source_latents from the posterior distribution. prev_source_latents = posterior_sample( @@ -3446,7 +3312,8 @@ def cycle_diffusion( t, clean_latents, generator=generator, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) # Compute noise. 
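The CycleDiffusion loop above runs the source and target branches through one batched UNet call: four rows are stacked in the order source-uncond, target-uncond, source-cond, target-cond, then `chunk(4)` splits the prediction so each branch gets its own guidance scale. A numpy stand-in for that pack/unpack pattern (shapes are illustrative):

```python
import numpy as np

def pack_four(source_latents, latents, source_embeds, embeds):
    # each input has 2 rows: index 0 = unconditional, index 1 = conditional
    packed_latents = np.stack([source_latents[0], latents[0], source_latents[1], latents[1]])
    packed_embeds = np.stack([source_embeds[0], embeds[0], source_embeds[1], embeds[1]])
    return packed_latents, packed_embeds

def unpack_four(concat_noise_pred, guidance_scale, source_guidance_scale):
    src_uncond, uncond, src_text, text = np.split(concat_noise_pred, 4, axis=0)
    noise_pred = uncond + guidance_scale * (text - uncond)
    source_noise_pred = src_uncond + source_guidance_scale * (src_text - src_uncond)
    return noise_pred, source_noise_pred

pred = np.random.randn(4, 4, 64, 64).astype("float32")
noise_pred, source_noise_pred = unpack_four(pred, 7.5, 1.0)
print(noise_pred.shape, source_noise_pred.shape)  # (1, 4, 64, 64) (1, 4, 64, 64)
```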
noise = compute_noise( self.scheduler, @@ -3454,29 +3321,24 @@ def cycle_diffusion( source_latents, t, source_noise_pred, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) source_latents = prev_source_latents.cast(dtype) # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( - noise_pred, - t, - latents, - variance_noise=noise, - **extra_step_kwargs).prev_sample + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample latents = latents.cast(dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": - image = self._decode_vae_latents(latents / - self.vae.config.scaling_factor) + image = self._decode_vae_latents(latents / self.vae.config.scaling_factor) image, has_nsfw_concept = self.run_safety_checker(image, dtype) else: image = latents @@ -3487,11 +3349,9 @@ def cycle_diffusion( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/examples/community/webui_stable_diffusion.py b/ppdiffusers/examples/community/webui_stable_diffusion.py index cad5739c1f3c5..c5c7cd4c8c0a9 100644 --- a/ppdiffusers/examples/community/webui_stable_diffusion.py +++ b/ppdiffusers/examples/community/webui_stable_diffusion.py @@ -25,22 +25,27 @@ import paddle.nn as nn import PIL import PIL.Image -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from ppdiffusers.models import (AutoencoderKL, ControlNetModel, - UNet2DConditionModel) +from ppdiffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ppdiffusers.models.controlnet import ControlNetOutput from ppdiffusers.models.modeling_utils import ModelMixin from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ppdiffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers -from ppdiffusers.utils import (PIL_INTERPOLATION, PPDIFFUSERS_CACHE, logging, - ppdiffusers_url_download, randn_tensor, - safetensors_load, smart_load, torch_load) +from ppdiffusers.utils import ( + PIL_INTERPOLATION, + PPDIFFUSERS_CACHE, + logging, + ppdiffusers_url_download, + randn_tensor, + safetensors_load, + smart_load, + torch_load, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -89,8 +94,7 @@ def resize(im, w, h): resized = resize(im, src_w, src_h) res = Image.new("RGB", (width, 
height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) else: ratio = width / height @@ -101,31 +105,22 @@ def resize(im, w, h): resized = resize(im, src_w, src_h) res = Image.new("RGB", (width, height)) - res.paste( - resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) if ratio < src_ratio: fill_height = height // 2 - src_h // 2 + res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0)) res.paste( - resized.resize( - (width, fill_height), box=(0, 0, width, 0)), - box=(0, 0)) - res.paste( - resized.resize( - (width, fill_height), - box=(0, resized.height, width, resized.height)), - box=(0, fill_height + src_h), ) + resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)), + box=(0, fill_height + src_h), + ) elif ratio > src_ratio: fill_width = width // 2 - src_w // 2 + res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0)) res.paste( - resized.resize( - (fill_width, height), box=(0, 0, 0, height)), - box=(0, 0)) - res.paste( - resized.resize( - (fill_width, height), - box=(resized.width, 0, resized.width, height)), - box=(fill_width + src_w, 0), ) + resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)), + box=(fill_width + src_w, 0), + ) return res @@ -137,8 +132,7 @@ def get_civitai_download_url(display_url, url_prefix="https://civitai.com"): import requests headers = { - "User-Agent": - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE" + "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE" } r = requests.get(display_url, headers=headers) soup = bs4.BeautifulSoup(r.text, "lxml") @@ -151,12 +145,13 @@ def get_civitai_download_url(display_url, url_prefix="https://civitai.com"): def http_file_name( - url: str, - *, - proxies=None, - headers: Optional[Dict[str, str]]=None, - timeout=10.0, - max_retries=0, ): + url: str, + *, + proxies=None, + headers: Optional[Dict[str, str]] = None, + timeout=10.0, + max_retries=0, +): """ Get a remote file name. 
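The two `resize_image` branches shown above differ only in which scale factor wins: crop-and-resize scales by the larger ratio and centre-crops the overflow, while resize-and-fill scales by the smaller ratio and pads the leftover bands by stretching the edge pixels. A sketch of just the size arithmetic, under that reading of the branches (function names are mine; the PIL pasting is omitted):

```python
def crop_and_resize_size(width, height, im_w, im_h):
    # scale so the image covers the target; the centred paste then crops the overflow
    ratio, src_ratio = width / height, im_w / im_h
    src_w = width if ratio > src_ratio else im_w * height // im_h
    src_h = height if ratio <= src_ratio else im_h * width // im_w
    return src_w, src_h

def resize_and_fill_size(width, height, im_w, im_h):
    # scale so the image fits inside the target; leftover bands are filled from the edges
    ratio, src_ratio = width / height, im_w / im_h
    src_w = width if ratio < src_ratio else im_w * height // im_h
    src_h = height if ratio >= src_ratio else im_h * width // im_w
    return src_w, src_h

print(crop_and_resize_size(512, 512, 640, 480))   # (682, 512): 170 columns get cropped away
print(resize_and_fill_size(512, 512, 640, 480))   # (512, 384): two 64px bands get filled
```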
""" @@ -168,7 +163,8 @@ def http_file_name( proxies=proxies, headers=headers, timeout=timeout, - max_retries=max_retries, ) + max_retries=max_retries, + ) hf_raise_for_status(r) displayed_name = url.split("/")[-1] content_disposition = r.headers.get("Content-Disposition") @@ -180,11 +176,12 @@ def http_file_name( @paddle.no_grad() def load_lora( - pipeline, - state_dict: dict, - LORA_PREFIX_UNET: str="lora_unet", - LORA_PREFIX_TEXT_ENCODER: str="lora_te", - ratio: float=1.0, ): + pipeline, + state_dict: dict, + LORA_PREFIX_UNET: str = "lora_unet", + LORA_PREFIX_TEXT_ENCODER: str = "lora_te", + ratio: float = 1.0, +): ratio = float(ratio) visited = [] for key in state_dict: @@ -192,8 +189,7 @@ def load_lora( continue if "text" in key: - tmp_layer_infos = (key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER - + "_")[-1].split("_")) + tmp_layer_infos = key.split(".")[0].split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") hf_to_ppnlp = { "encoder": "transformer", "fc1": "linear1", @@ -206,8 +202,7 @@ def load_lora( layer_infos.append(hf_to_ppnlp.get(layer_info, layer_info)) curr_layer: paddle.nn.Linear = pipeline.text_encoder else: - layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[ - -1].split("_") + layer_infos = key.split(".")[0].split(LORA_PREFIX_UNET + "_")[-1].split("_") curr_layer: paddle.nn.Linear = pipeline.unet temp_name = layer_infos.pop(0) @@ -248,24 +243,29 @@ def load_lora( if weight_down.shape[2:4] == [1, 1]: # conv2d 1x1 curr_layer.weight.copy_( - curr_layer.weight + ratio * paddle.matmul( - weight_up.squeeze([-1, -2]), - weight_down.squeeze([-1, -2])).unsqueeze([-1, -2]) * - scale, - True, ) + curr_layer.weight + + ratio + * paddle.matmul(weight_up.squeeze([-1, -2]), weight_down.squeeze([-1, -2])).unsqueeze([-1, -2]) + * scale, + True, + ) else: # conv2d 3x3 curr_layer.weight.copy_( - curr_layer.weight + ratio * paddle.nn.functional.conv2d( - weight_down.transpose([1, 0, 2, 3]), - weight_up).transpose([1, 0, 2, 3]) * scale, - True, ) + curr_layer.weight + + ratio + * paddle.nn.functional.conv2d(weight_down.transpose([1, 0, 2, 3]), weight_up).transpose( + [1, 0, 2, 3] + ) + * scale, + True, + ) else: # linear curr_layer.weight.copy_( - curr_layer.weight + ratio * paddle.matmul( - weight_up, weight_down).T * scale, - True, ) + curr_layer.weight + ratio * paddle.matmul(weight_up, weight_down).T * scale, + True, + ) # update visited list visited.extend(triplet_keys) @@ -285,28 +285,25 @@ class MultiControlNetModel(ModelMixin): `ControlNetModel` as a list. 
""" - def __init__( - self, - controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): + def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): super().__init__() self.nets = nn.LayerList(controlnets) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: List[paddle.Tensor], - conditioning_scale: List[float], - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - guess_mode: bool=False, - return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]: - for i, ( - image, scale, controlnet - ) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + controlnet_cond: List[paddle.Tensor], + conditioning_scale: List[float], + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: + for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): down_samples, mid_sample = controlnet( sample, timestep, @@ -318,7 +315,8 @@ def forward( attention_mask, cross_attention_kwargs, guess_mode, - return_dict, ) + return_dict, + ) # merge samples if i == 0: @@ -326,8 +324,7 @@ def forward( else: down_block_res_samples = [ samples_prev + samples_curr - for samples_prev, samples_curr in zip( - down_block_res_samples, down_samples) + for samples_prev, samples_curr in zip(down_block_res_samples, down_samples) ] mid_block_res_sample += mid_sample @@ -373,17 +370,22 @@ class WebUIStableDiffusionPipeline(DiffusionPipeline): TI_DIR = os.path.join(PPDIFFUSERS_CACHE, "textual_inversion") def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ - ControlNetModel], MultiControlNetModel, ]=None, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + controlnet: Union[ + ControlNetModel, + List[ControlNetModel], + Tuple[ControlNetModel], + MultiControlNetModel, + ] = None, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -413,8 +415,9 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # custom data @@ -441,9 +444,9 @@ def __init__( # register_state_dict_hook to fix text_encoder, when we save_pretrained text model. 
def map_to(state_dict, *args, **kwargs): if "text_model.token_embedding.wrapped.weight" in state_dict: - state_dict[ - "text_model.token_embedding.weight"] = state_dict.pop( - "text_model.token_embedding.wrapped.weight") + state_dict["text_model.token_embedding.weight"] = state_dict.pop( + "text_model.token_embedding.wrapped.weight" + ) return state_dict self.text_encoder.register_state_dict_hook(map_to) @@ -466,7 +469,8 @@ def download_civitai_lora_file(self, url): file_path = ppdiffusers_url_download( download_url, cache_dir=self.LORA_DIR, - filename=http_file_name(download_url).strip('"'), ) + filename=http_file_name(download_url).strip('"'), + ) return file_path def download_civitai_ti_file(self, url): @@ -479,7 +483,8 @@ def download_civitai_ti_file(self, url): file_path = ppdiffusers_url_download( download_url, cache_dir=self.TI_DIR, - filename=http_file_name(download_url).strip('"'), ) + filename=http_file_name(download_url).strip('"'), + ) return file_path def change_scheduler(self, scheduler_type="ddim"): @@ -488,55 +493,56 @@ def change_scheduler(self, scheduler_type="ddim"): def switch_scheduler(self, scheduler_type="ddim"): scheduler_type = scheduler_type.lower() from ppdiffusers import ( - DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UniPCMultistepScheduler) + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UniPCMultistepScheduler, + ) if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif 
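Returning to the `load_lora` hunk above: the LoRA update is folded directly into the base weights, so for a linear layer the merge is `W += ratio * (up @ down).T * scale`, with the transpose accounting for paddle's `[in_features, out_features]` weight layout. A toy numpy version of the linear case; the `alpha / rank` scale is an assumption, since that part of the function is not shown in the hunk:

```python
import numpy as np

def merge_lora_linear(weight, weight_up, weight_down, ratio=1.0, alpha=None):
    # weight:      [in_features, out_features]  (paddle.nn.Linear layout)
    # weight_up:   [out_features, rank]
    # weight_down: [rank, in_features]
    rank = weight_up.shape[1]
    scale = (alpha / rank) if alpha is not None else 1.0   # assumed convention
    return weight + ratio * (weight_up @ weight_down).T * scale

W = np.zeros((768, 768), dtype="float32")
up = np.random.randn(768, 4).astype("float32") * 0.01
down = np.random.randn(4, 768).astype("float32") * 0.01
print(merge_lora_linear(W, up, down, ratio=0.8, alpha=4).shape)  # (768, 768)
```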
scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, - ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError( f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" @@ -545,30 +551,28 @@ def switch_scheduler(self, scheduler_type="ddim"): @paddle.no_grad() def _encode_prompt( - self, - prompt: str, - do_classifier_free_guidance: float=7.5, - negative_prompt: str=None, - num_inference_steps: int=50, ): + self, + prompt: str, + do_classifier_free_guidance: float = 7.5, + negative_prompt: str = None, + num_inference_steps: int = 50, + ): if do_classifier_free_guidance: assert isinstance(negative_prompt, str) negative_prompt = [negative_prompt] - uc = get_learned_conditioning(self.sj.clip, negative_prompt, - num_inference_steps) + uc = get_learned_conditioning(self.sj.clip, negative_prompt, num_inference_steps) else: uc = None - c = get_multicond_learned_conditioning(self.sj.clip, prompt, - num_inference_steps) + c = get_multicond_learned_conditioning(self.sj.clip, prompt, num_inference_steps) return c, uc def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -587,48 +591,43 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - negative_prompt=None, - controlnet_conditioning_scale=1.0, ): + self, + prompt, + image, + height, + width, + callback_steps, + negative_prompt=None, + controlnet_conditioning_scale=1.0, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` 
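`prepare_extra_step_kwargs` above uses `inspect.signature` so that `eta` and `generator` are only forwarded to schedulers whose `step()` actually accepts them (DDIM takes `eta`; most others do not). The pattern in isolation, with toy step functions standing in for real schedulers:

```python
import inspect

def build_extra_step_kwargs(scheduler_step_fn, eta=0.0, generator=None):
    params = set(inspect.signature(scheduler_step_fn).parameters.keys())
    extra = {}
    if "eta" in params:
        extra["eta"] = eta
    if "generator" in params:
        extra["generator"] = generator
    return extra

# toy step functions standing in for scheduler.step
def ddim_like_step(model_output, timestep, sample, eta=0.0, generator=None):
    return sample

def pndm_like_step(model_output, timestep, sample):
    return sample

print(build_extra_step_kwargs(ddim_like_step, eta=0.1))  # {'eta': 0.1, 'generator': None}
print(build_extra_step_kwargs(pndm_like_step, eta=0.1))  # {}
```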
have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and not isinstance(prompt, str): - raise ValueError( - f"`prompt` has to be of type `str` but is {type(prompt)}") + raise ValueError(f"`prompt` has to be of type `str` but is {type(prompt)}") if negative_prompt is not None and not isinstance(negative_prompt, str): - raise ValueError( - f"`negative_prompt` has to be of type `str` but is {type(negative_prompt)}" - ) + raise ValueError(f"`negative_prompt` has to be of type `str` but is {type(negative_prompt)}") # `prompt` needs more sophisticated handling when there are multiple # conditionings. @@ -645,15 +644,12 @@ def check_inputs( self.check_image(image, prompt) elif isinstance(self.controlnet, MultiControlNetModel): if not isinstance(image, list): - raise TypeError( - "For multiple controlnets: `image` must be type `list`") + raise TypeError("For multiple controlnets: `image` must be type `list`") # When `image` is a nested list: # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) elif any(isinstance(i, list) for i in image): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." - ) + raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( "For multiple controlnets: `image` must have the same length as the number of controlnets." @@ -666,39 +662,31 @@ def check_inputs( # Check `controlnet_conditioning_scale` if isinstance(self.controlnet, ControlNetModel): - if not isinstance(controlnet_conditioning_scale, - (float, list, tuple)): + if not isinstance(controlnet_conditioning_scale, (float, list, tuple)): raise TypeError( "For single controlnet: `controlnet_conditioning_scale` must be type `float, list(float) or tuple(float)`." ) elif isinstance(self.controlnet, MultiControlNetModel): if isinstance(controlnet_conditioning_scale, list): - if any( - isinstance(i, list) - for i in controlnet_conditioning_scale): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." 
- ) - elif isinstance( - controlnet_conditioning_scale, - list) and len(controlnet_conditioning_scale) != len( - self.controlnet.nets): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): raise ValueError( "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets") + " the same length as the number of controlnets" + ) else: assert False def check_image(self, image, prompt): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, paddle.Tensor) - image_is_pil_list = isinstance(image, list) and isinstance( - image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance( - image[0], paddle.Tensor) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) - if (not image_is_pil and not image_is_tensor and - not image_is_pil_list and not image_is_tensor_list): + if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: raise TypeError( "image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors" ) @@ -725,27 +713,16 @@ def check_image(self, image, prompt): def prepare_image(self, image, width, height, dtype, resize_mode=-1): if not isinstance(image, paddle.Tensor): if isinstance(image, PIL.Image.Image): - image = resize_image( - resize_mode=resize_mode, - im=image, - width=width, - height=height) + image = resize_image(resize_mode=resize_mode, im=image, width=width, height=height) image = [image] if isinstance(image[0], PIL.Image.Image): - image = [ - resize_image( - resize_mode=resize_mode, - im=im, - width=width, - height=height) for im in image - ] + image = [resize_image(resize_mode=resize_mode, im=im, width=width, height=height) for im in image] images = [] for image_ in image: image_ = image_.convert("RGB") - image_ = image_.resize( - (width, height), resample=PIL_INTERPOLATION["lanczos"]) + image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) image_ = np.array(image_) image_ = image_[None, :] images.append(image_) @@ -761,14 +738,15 @@ def prepare_image(self, image, width, height, dtype, resize_mode=-1): return image def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -812,31 +790,31 @@ def _default_height_width(self, height, width, image): @paddle.no_grad() def __call__( - self, - prompt: str=None, - image: PIL.Image.Image=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: str=None, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - clip_skip: int=1, - 
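`prepare_image` above converts the ControlNet conditioning input to a fixed-size RGB batch: resize with Lanczos, convert to arrays, and stack. The hunk ends before the final normalisation, so the `/ 255` and NCHW transpose in this sketch are assumptions based on how ControlNet conditioning images are usually prepared:

```python
import numpy as np
from PIL import Image

def prepare_control_image(images, width, height):
    arrays = []
    for im in images:
        im = im.convert("RGB").resize((width, height), resample=Image.LANCZOS)
        arrays.append(np.array(im)[None, :])             # [1, H, W, 3]
    batch = np.concatenate(arrays, axis=0).astype("float32") / 255.0   # assumed scaling
    return batch.transpose(0, 3, 1, 2)                   # [N, 3, H, W], assumed layout

print(prepare_control_image([Image.new("RGB", (640, 480))], 512, 512).shape)  # (1, 3, 512, 512)
```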
controlnet_conditioning_scale: Union[float, List[float]]=1.0, - enable_lora: bool=True, - resize_mode: int=0, - # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"] - # 0 1 2 -1 - starting_control_step: float=0.0, - ending_control_step: float=1.0, ): + self, + prompt: str = None, + image: PIL.Image.Image = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: str = None, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = 1, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + enable_lora: bool = True, + resize_mode: int = 0, + # ["Just resize", "Crop and resize", "Resize and fill", "Do nothing"] + # 0 1 2 -1 + starting_control_step: float = 0.0, + ending_control_step: float = 1.0, + ): r""" Function invoked when calling the pipeline for generation. @@ -914,17 +892,16 @@ def __call__( # 0. Default height and width to unet if enable_control: if isinstance(self.controlnet, ControlNetModel): - height, width = self._default_height_width(height, width, - image) + height, width = self._default_height_width(height, width, image) image = self.prepare_image( image=image, width=width, height=height, dtype=self.controlnet.dtype, - resize_mode=resize_mode, ) + resize_mode=resize_mode, + ) elif isinstance(self.controlnet, MultiControlNetModel): - height, width = self._default_height_width(height, width, - image) + height, width = self._default_height_width(height, width, image) images = [] for image_ in image: @@ -933,16 +910,15 @@ def __call__( width=width, height=height, dtype=self.controlnet.dtype, - resize_mode=resize_mode, ) + resize_mode=resize_mode, + ) images.append(image_) image = images else: - height = height or max(self.unet.config.sample_size * - self.vae_scale_factor, 512) - width = width or max(self.unet.config.sample_size * - self.vae_scale_factor, 512) + height = height or max(self.unet.config.sample_size * self.vae_scale_factor, 512) + width = width or max(self.unet.config.sample_size * self.vae_scale_factor, 512) # 1. Check inputs. Raise error if not correct self.check_inputs( @@ -952,7 +928,8 @@ def __call__( width, callback_steps, negative_prompt, - controlnet_conditioning_scale, ) + controlnet_conditioning_scale, + ) # 2. 
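When ControlNet is disabled, the default resolution above falls back to `unet.config.sample_size * vae_scale_factor`, floored at 512, where `vae_scale_factor` is `2 ** (len(block_out_channels) - 1)` from the pipeline's `__init__`. For the familiar SD 1.x configuration (values below are the well-known defaults, not read from this repo) that works out to 512:

```python
def default_sample_size(unet_sample_size, vae_block_out_channels, minimum=512):
    # vae_scale_factor is 2 ** (number of VAE downsampling stages)
    vae_scale_factor = 2 ** (len(vae_block_out_channels) - 1)
    return max(unet_sample_size * vae_scale_factor, minimum)

# SD 1.x: latent sample_size=64, VAE block_out_channels=(128, 256, 512, 512) -> 64 * 8 = 512
print(default_sample_size(64, (128, 256, 512, 512)))  # 512
```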
Define call parameters batch_size = 1 @@ -966,47 +943,34 @@ def __call__( if enable_lora and self.LORA_DIR is not None: if os.path.exists(self.LORA_DIR): - lora_mapping = { - p.stem: p.absolute() - for p in Path(self.LORA_DIR).glob("*.safetensors") - } + lora_mapping = {p.stem: p.absolute() for p in Path(self.LORA_DIR).glob("*.safetensors")} for params in extra_network_data["lora"]: assert len(params.items) > 0 name = params.items[0] if name in lora_mapping: - ratio = (float(params.items[1]) - if len(params.items) > 1 else 1.0) - lora_state_dict = smart_load( - lora_mapping[name], - map_location=paddle.get_device()) + ratio = float(params.items[1]) if len(params.items) > 1 else 1.0 + lora_state_dict = smart_load(lora_mapping[name], map_location=paddle.get_device()) self.weights_has_changed = True - load_lora( - self, state_dict=lora_state_dict, ratio=ratio) + load_lora(self, state_dict=lora_state_dict, ratio=ratio) del lora_state_dict else: - print( - f"We can't find lora weight: {name}! Please make sure that exists!" - ) + print(f"We can't find lora weight: {name}! Please make sure that exists!") else: if len(extra_network_data["lora"]) > 0: - print( - f"{self.LORA_DIR} not exists, so we cant load loras!" - ) + print(f"{self.LORA_DIR} not exists, so we cant load loras!") self.sj.clip.CLIP_stop_at_last_layers = clip_skip - if isinstance(self.controlnet, MultiControlNetModel) and isinstance( - controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [ - controlnet_conditioning_scale - ] * len(self.controlnet.nets) + if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets) # 3. Encode input prompt prompt_embeds, negative_prompt_embeds = self._encode_prompt( prompts, do_classifier_free_guidance, negative_prompt, - num_inference_steps=num_inference_steps, ) + num_inference_steps=num_inference_steps, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -1021,127 +985,107 @@ def __call__( width, self.unet.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
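In the `__call__` body above, LoRA entries parsed out of the prompt are resolved against `*.safetensors` files in `LORA_DIR` by file stem, with an optional second item read as the merge ratio (default 1.0). A self-contained version of that lookup; the directory path and request format are illustrative:

```python
from pathlib import Path

def resolve_lora_requests(lora_dir, requests):
    """requests: list of parameter item-lists like ["name"] or ["name", "0.6"],
    as produced by the extra-network prompt parser."""
    mapping = {p.stem: p.absolute() for p in Path(lora_dir).glob("*.safetensors")}
    resolved, missing = [], []
    for items in requests:
        name = items[0]
        ratio = float(items[1]) if len(items) > 1 else 1.0
        if name in mapping:
            resolved.append((mapping[name], ratio))
        else:
            missing.append(name)
    return resolved, missing

# e.g. resolve_lora_requests("/path/to/lora", [["animeLineart", "0.6"], ["not_downloaded"]])
```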
Denoising loop - num_warmup_steps = ( - len(timesteps) - num_inference_steps * self.scheduler.order) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): current_control_step = i / len(timesteps) step = i // self.scheduler.order do_batch = False - conds_list, cond_tensor = reconstruct_multicond_batch( - prompt_embeds, step) + conds_list, cond_tensor = reconstruct_multicond_batch(prompt_embeds, step) try: weight = conds_list[0][0][1] except Exception: weight = 1.0 if do_classifier_free_guidance: - uncond_tensor = reconstruct_cond_batch( - negative_prompt_embeds, step) - do_batch = cond_tensor.shape[1] == uncond_tensor.shape[ - 1] and not isinstance(self.controlnet, - MultiControlNetModel) + uncond_tensor = reconstruct_cond_batch(negative_prompt_embeds, step) + do_batch = cond_tensor.shape[1] == uncond_tensor.shape[1] and not isinstance( + self.controlnet, MultiControlNetModel + ) # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_batch else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_batch else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) if do_batch: - encoder_hidden_states = paddle.concat( - [uncond_tensor, cond_tensor]) + encoder_hidden_states = paddle.concat([uncond_tensor, cond_tensor]) control_kwargs = {} - if (enable_control and starting_control_step < - current_control_step < ending_control_step): - ( - down_block_res_samples, - mid_block_res_sample, - ) = self.controlnet( + if enable_control and starting_control_step < current_control_step < ending_control_step: + (down_block_res_samples, mid_block_res_sample,) = self.controlnet( latent_model_input, t, encoder_hidden_states=encoder_hidden_states, controlnet_cond=paddle.concat([image, image]), conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) - control_kwargs[ - "down_block_additional_residuals"] = down_block_res_samples - control_kwargs[ - "mid_block_additional_residual"] = mid_block_res_sample + return_dict=False, + ) + control_kwargs["down_block_additional_residuals"] = down_block_res_samples + control_kwargs["mid_block_additional_residual"] = mid_block_res_sample noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, ).sample + **control_kwargs, + ).sample noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + weight * guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text - noise_pred_uncond + ) else: control_kwargs = {} - if (enable_control and starting_control_step < - current_control_step < ending_control_step): - ( - down_block_res_samples, - mid_block_res_sample, - ) = self.controlnet( + if enable_control and starting_control_step < current_control_step < ending_control_step: + (down_block_res_samples, mid_block_res_sample,) = self.controlnet( latent_model_input, t, encoder_hidden_states=cond_tensor, controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) - control_kwargs[ - "down_block_additional_residuals"] = down_block_res_samples - control_kwargs[ - "mid_block_additional_residual"] = mid_block_res_sample + return_dict=False, + ) + 
control_kwargs["down_block_additional_residuals"] = down_block_res_samples + control_kwargs["mid_block_additional_residual"] = mid_block_res_sample noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=cond_tensor, cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, ).sample + **control_kwargs, + ).sample if do_classifier_free_guidance: control_kwargs = {} - if (enable_control and starting_control_step < - current_control_step < ending_control_step): - ( - down_block_res_samples, - mid_block_res_sample, - ) = self.controlnet( + if enable_control and starting_control_step < current_control_step < ending_control_step: + (down_block_res_samples, mid_block_res_sample,) = self.controlnet( latent_model_input, t, encoder_hidden_states=uncond_tensor, controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, - return_dict=False, ) - control_kwargs[ - "down_block_additional_residuals"] = down_block_res_samples - control_kwargs[ - "mid_block_additional_residual"] = mid_block_res_sample + return_dict=False, + ) + control_kwargs["down_block_additional_residuals"] = down_block_res_samples + control_kwargs["mid_block_additional_residual"] = mid_block_res_sample noise_pred_uncond = self.unet( latent_model_input, t, encoder_hidden_states=uncond_tensor, cross_attention_kwargs=cross_attention_kwargs, - **control_kwargs, ).sample - noise_pred = noise_pred_uncond + weight * guidance_scale * ( - noise_pred - noise_pred_uncond) + **control_kwargs, + ).sample + noise_pred = noise_pred_uncond + weight * guidance_scale * (noise_pred - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1154,8 +1098,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, self.unet.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -1164,14 +1107,12 @@ def __call__( image = self.decode_latents(latents) # 9. 
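Throughout the denoising loop above, ControlNet residuals are only attached while the normalised step index lies strictly inside `(starting_control_step, ending_control_step)`; outside that window `control_kwargs` stays empty and the UNet ignores the control image. The gate in isolation:

```python
def control_active(i, num_timesteps, starting_control_step=0.0, ending_control_step=1.0, enable_control=True):
    # current_control_step is the fraction of the schedule already consumed
    current_control_step = i / num_timesteps
    return enable_control and starting_control_step < current_control_step < ending_control_step

steps = 20
active = [control_active(i, steps, 0.2, 0.8) for i in range(steps)]
print(active.count(True))  # 11 of 20 steps receive ControlNet residuals
```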
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, self.unet.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, self.unet.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) except Exception as e: raise ValueError(e) finally: @@ -1215,12 +1156,7 @@ class FrozenCLIPEmbedder(nn.Layer): LAYERS = ["last", "pooled", "hidden"] - def __init__(self, - text_encoder, - tokenizer, - freeze=True, - layer="last", - layer_idx=None): + def __init__(self, text_encoder, tokenizer, freeze=True, layer="last", layer_idx=None): super().__init__() assert layer in self.LAYERS self.tokenizer = tokenizer @@ -1244,12 +1180,14 @@ def forward(self, text): truncation=True, max_length=self.tokenizer.model_max_length, padding="max_length", - return_tensors="pd", ) + return_tensors="pd", + ) tokens = batch_encoding["input_ids"] outputs = self.text_encoder( input_ids=tokens, output_hidden_states=self.layer == "hidden", - return_dict=True, ) + return_dict=True, + ) if self.layer == "last": z = outputs.last_hidden_state elif self.layer == "pooled": @@ -1288,8 +1226,7 @@ def empty_chunk(self): def get_target_prompt_token_count(self, token_count): """returns the maximum number of tokens a prompt of a known length can have before it requires one more PromptChunk to be represented""" - return math.ceil(max(token_count, 1) / - self.chunk_length) * self.chunk_length + return math.ceil(max(token_count, 1) / self.chunk_length) * self.chunk_length def tokenize(self, texts): """Converts a batch of texts into a batch of token ids""" @@ -1370,10 +1307,12 @@ def next_chunk(is_last=False): # this is when we are at the end of alloted 75 tokens for the current chunk, and the current token is not a comma. opts.comma_padding_backtrack # is a setting that specifies that if there is a comma nearby, the text after the comma should be moved out of this chunk and into the next. 
- elif (WebUIStableDiffusionPipeline.comma_padding_backtrack != 0 - and len(chunk.tokens) == self.chunk_length and - last_comma != -1 and len(chunk.tokens) - last_comma <= - WebUIStableDiffusionPipeline.comma_padding_backtrack): + elif ( + WebUIStableDiffusionPipeline.comma_padding_backtrack != 0 + and len(chunk.tokens) == self.chunk_length + and last_comma != -1 + and len(chunk.tokens) - last_comma <= WebUIStableDiffusionPipeline.comma_padding_backtrack + ): break_location = last_comma + 1 reloc_tokens = chunk.tokens[break_location:] @@ -1392,8 +1331,7 @@ def next_chunk(is_last=False): ( embedding, embedding_length_in_tokens, - ) = self.hijack.embedding_db.find_embedding_at_position( - tokens, position) + ) = self.hijack.embedding_db.find_embedding_at_position(tokens, position) if embedding is None: chunk.tokens.append(token) chunk.multipliers.append(weight) @@ -1455,10 +1393,7 @@ def forward(self, texts): zs = [] for i in range(chunk_count): - batch_chunk = [ - chunks[i] if i < len(chunks) else self.empty_chunk() - for chunks in batch_chunks - ] + batch_chunk = [chunks[i] if i < len(chunks) else self.empty_chunk() for chunks in batch_chunks] tokens = [x.tokens for x in batch_chunk] multipliers = [x.multipliers for x in batch_chunk] @@ -1472,10 +1407,9 @@ def forward(self, texts): zs.append(z) if len(used_embeddings) > 0: - embeddings_list = ", ".join([ - f"{name} [{embedding.checksum()}]" - for name, embedding in used_embeddings.items() - ]) + embeddings_list = ", ".join( + [f"{name} [{embedding.checksum()}]" for name, embedding in used_embeddings.items()] + ) self.hijack.comments.append(f"Used embeddings: {embeddings_list}") return paddle.concat(zs, axis=1) @@ -1494,15 +1428,19 @@ def process_tokens(self, remade_batch_tokens, batch_multipliers): if self.id_end != self.id_pad: for batch_pos in range(len(remade_batch_tokens)): index = remade_batch_tokens[batch_pos].index(self.id_end) - tokens[batch_pos, index + 1:tokens.shape[1]] = self.id_pad + tokens[batch_pos, index + 1 : tokens.shape[1]] = self.id_pad z = self.encode_with_text_encoder(tokens) # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise batch_multipliers = paddle.to_tensor(batch_multipliers) original_mean = z.mean() - z = z * batch_multipliers.reshape(batch_multipliers.shape + - [1, ]).expand(z.shape) + z = z * batch_multipliers.reshape( + batch_multipliers.shape + + [ + 1, + ] + ).expand(z.shape) new_mean = z.mean() z = z * (original_mean / new_mean) @@ -1520,8 +1458,7 @@ def __init__(self, wrapped, hijack, CLIP_stop_at_last_layers=-1): self.comma_token = vocab.get(",", None) self.token_mults = {} - tokens_with_parens = [(k, v) for k, v in vocab.items() - if "(" in k or ")" in k or "[" in k or "]" in k] + tokens_with_parens = [(k, v) for k, v in vocab.items() if "(" in k or ")" in k or "[" in k or "]" in k] for text, ident in tokens_with_parens: mult = 1.0 for c in text: @@ -1542,8 +1479,7 @@ def __init__(self, wrapped, hijack, CLIP_stop_at_last_layers=-1): self.id_pad = self.id_end def tokenize(self, texts): - tokenized = self.wrapped.tokenizer( - texts, truncation=False, add_special_tokens=False)["input_ids"] + tokenized = self.wrapped.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"] return tokenized @@ -1552,7 +1488,8 @@ def encode_with_text_encoder(self, tokens): outputs = self.wrapped.text_encoder( input_ids=tokens, output_hidden_states=output_hidden_states, - return_dict=True, ) + return_dict=True, + ) if output_hidden_states: z = 
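`process_tokens` above applies the per-token attention multipliers (from `(word:1.3)`-style emphasis) to the CLIP hidden states and then rescales so the global mean of the embedding is unchanged, which the original comment describes as a heuristic against artifacts. A numpy sketch of that weighting:

```python
import numpy as np

def apply_multipliers(z, multipliers):
    # z: [batch, tokens, dim] hidden states; multipliers: [batch, tokens]
    original_mean = z.mean()
    z = z * multipliers[..., None]            # emphasise / de-emphasise individual tokens
    z = z * (original_mean / z.mean())        # restore the original global mean
    return z

z = np.random.randn(1, 77, 768).astype("float32")
mult = np.ones((1, 77), dtype="float32")
mult[0, 5] = 1.3                              # e.g. "(castle:1.3)" raised one token's weight
out = apply_multipliers(z, mult)
print(float(z.mean()), float(out.mean()))     # the global mean is preserved
```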
outputs.hidden_states[-self.CLIP_stop_at_last_layers] @@ -1564,11 +1501,9 @@ def encode_with_text_encoder(self, tokens): def encode_embedding_init_text(self, init_text, nvpt): embedding_layer = self.wrapped.text_encoder.text_model - ids = self.wrapped.tokenizer( - init_text, - max_length=nvpt, - return_tensors="pd", - add_special_tokens=False)["input_ids"] + ids = self.wrapped.tokenizer(init_text, max_length=nvpt, return_tensors="pd", add_special_tokens=False)[ + "input_ids" + ] embedded = embedding_layer.token_embedding.wrapped(ids).squeeze(0) return embedded @@ -1630,8 +1565,7 @@ def parse_prompts(prompts): class EmbeddingDecoder(json.JSONDecoder): def __init__(self, *args, **kwargs): - json.JSONDecoder.__init__( - self, object_hook=self.object_hook, *args, **kwargs) + json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) def object_hook(self, d): if "TORCHTENSOR" in d: @@ -1652,8 +1586,7 @@ def lcg(m=2**32, a=1664525, c=1013904223, seed=0): def xor_block(block): g = lcg() - randblock = (np.array([next(g) for _ in range(np.product(block.shape))]) - .astype(np.uint8).reshape(block.shape)) + randblock = np.array([next(g) for _ in range(np.product(block.shape))]).astype(np.uint8).reshape(block.shape) return np.bitwise_xor(block.astype(np.uint8), randblock & 0x0F) @@ -1667,16 +1600,17 @@ def crop_black(img, tol=0): def extract_image_data_embed(image): d = 3 - outarr = (crop_black( - np.array(image.convert("RGB").getdata()) - .reshape(image.size[1], image.size[0], d).astype(np.uint8)) & 0x0F) + outarr = ( + crop_black(np.array(image.convert("RGB").getdata()).reshape(image.size[1], image.size[0], d).astype(np.uint8)) + & 0x0F + ) black_cols = np.where(np.sum(outarr, axis=(0, 2)) == 0) if black_cols[0].shape[0] < 2: print("No Image data blocks found.") return None - data_block_lower = outarr[:, :black_cols[0].min(), :].astype(np.uint8) - data_block_upper = outarr[:, black_cols[0].max() + 1:, :].astype(np.uint8) + data_block_lower = outarr[:, : black_cols[0].min(), :].astype(np.uint8) + data_block_upper = outarr[:, black_cols[0].max() + 1 :, :].astype(np.uint8) data_block_lower = xor_block(data_block_lower) data_block_upper = xor_block(data_block_upper) @@ -1703,7 +1637,8 @@ def extract_image_data_embed(image): # [75, 'fantasy landscape with a lake and an oak in background masterful'] # [100, 'fantasy landscape with a lake and a christmas tree in background masterful'] -schedule_parser = lark.Lark(r""" +schedule_parser = lark.Lark( + r""" !start: (prompt | /[][():]/+)* prompt: (emphasized | scheduled | alternate | plain | WHITESPACE)* !emphasized: "(" prompt ")" @@ -1714,7 +1649,8 @@ def extract_image_data_embed(image): WHITESPACE: /\s+/ plain: /([^\\\[\]():|]|\\.)+/ %import common.SIGNED_NUMBER -> NUMBER -""") +""" +) def get_learned_conditioning_prompt_schedules(prompts, steps): @@ -1806,8 +1742,7 @@ def get_schedule(prompt): return [promptdict[prompt] for prompt in prompts] -ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", - ["end_at_step", "cond"]) +ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", ["end_at_step", "cond"]) def get_learned_conditioning(model, prompts, steps): @@ -1845,8 +1780,7 @@ def get_learned_conditioning(model, prompts, steps): cond_schedule = [] for i, (end_at_step, text) in enumerate(prompt_schedule): - cond_schedule.append( - ScheduledPromptConditioning(end_at_step, conds[i])) + cond_schedule.append(ScheduledPromptConditioning(end_at_step, conds[i])) cache[prompt] = cond_schedule 
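The `lcg`/`xor_block` helpers above implement the light obfuscation used when embedding data is hidden in the low nibbles of a preview image: a fixed-seed linear congruential generator produces a repeatable byte stream, and XOR-ing with it is its own inverse, so one function both encodes and decodes. The generator body is not visible in the hunk, so the `% 255` stream below is an assumption:

```python
import numpy as np

def lcg(m=2**32, a=1664525, c=1013904223, seed=0):
    while True:
        seed = (a * seed + c) % m
        yield seed % 255                      # assumed byte stream

def xor_block(block):
    g = lcg()
    randblock = np.array([next(g) for _ in range(block.size)], dtype=np.uint8).reshape(block.shape)
    return np.bitwise_xor(block.astype(np.uint8), randblock & 0x0F)

payload = np.random.randint(0, 16, size=(4, 4), dtype=np.uint8)    # low-nibble data block
print(np.array_equal(xor_block(xor_block(payload)), payload))      # True: XOR-ing twice restores the data
```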
res.append(cond_schedule) @@ -1871,8 +1805,7 @@ def get_multicond_prompt_list(prompts): for subprompt in subprompts: match = re_weight.search(subprompt) - text, weight = match.groups() if match is not None else (subprompt, - 1.0) + text, weight = match.groups() if match is not None else (subprompt, 1.0) weight = float(weight) if weight is not None else 1.0 @@ -1897,43 +1830,37 @@ def __init__(self, schedules, weight=1.0): class MulticondLearnedConditioning: def __init__(self, shape, batch): - self.shape: tuple = ( - shape # the shape field is needed to send this object to DDIM/PLMS - ) + self.shape: tuple = shape # the shape field is needed to send this object to DDIM/PLMS self.batch: List[List[ComposableScheduledPromptConditioning]] = batch -def get_multicond_learned_conditioning(model, prompts, - steps) -> MulticondLearnedConditioning: +def get_multicond_learned_conditioning(model, prompts, steps) -> MulticondLearnedConditioning: """same as get_learned_conditioning, but returns a list of ScheduledPromptConditioning along with the weight objects for each prompt. For each prompt, the list is obtained by splitting the prompt using the AND separator. https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/ """ - res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list( - prompts) + res_indexes, prompt_flat_list, prompt_indexes = get_multicond_prompt_list(prompts) - learned_conditioning = get_learned_conditioning(model, prompt_flat_list, - steps) + learned_conditioning = get_learned_conditioning(model, prompt_flat_list, steps) res = [] for indexes in res_indexes: - res.append([ - ComposableScheduledPromptConditioning(learned_conditioning[i], - weight) - for i, weight in indexes - ]) + res.append([ComposableScheduledPromptConditioning(learned_conditioning[i], weight) for i, weight in indexes]) - return MulticondLearnedConditioning(shape=(len(prompts), ), batch=res) + return MulticondLearnedConditioning(shape=(len(prompts),), batch=res) -def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]], - current_step): +def reconstruct_cond_batch(c: List[List[ScheduledPromptConditioning]], current_step): param = c[0][0].cond res = paddle.zeros( - [len(c), ] + param.shape, - dtype=param.dtype, ) + [ + len(c), + ] + + param.shape, + dtype=param.dtype, + ) for i, cond_schedule in enumerate(c): target_index = 0 for current, (end_at, cond) in enumerate(cond_schedule): @@ -1956,8 +1883,7 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step): for cond_index, composable_prompt in enumerate(composable_prompts): target_index = 0 - for current, (end_at, - cond) in enumerate(composable_prompt.schedules): + for current, (end_at, cond) in enumerate(composable_prompt.schedules): if current_step <= end_at: target_index = current break @@ -1973,10 +1899,8 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step): for i in range(len(tensors)): if tensors[i].shape[0] != token_count: last_vector = tensors[i][-1:] - last_vector_repeated = last_vector.tile( - [token_count - tensors[i].shape[0], 1]) - tensors[i] = paddle.concat( - [tensors[i], last_vector_repeated], axis=0) + last_vector_repeated = last_vector.tile([token_count - tensors[i].shape[0], 1]) + tensors[i] = paddle.concat([tensors[i], last_vector_repeated], axis=0) return conds_list, paddle.stack(tensors).cast(dtype=param.dtype) @@ -1997,7 +1921,8 @@ def reconstruct_multicond_batch(c: MulticondLearnedConditioning, current_step): 
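`reconstruct_cond_batch` and `reconstruct_multicond_batch` above select, per prompt, the first `ScheduledPromptConditioning` whose `end_at_step` has not yet passed, and pad conditionings of different token lengths by repeating the final vector so everything stacks into one batch. A simplified numpy version of both ideas:

```python
from collections import namedtuple
import numpy as np

ScheduledPromptConditioning = namedtuple("ScheduledPromptConditioning", ["end_at_step", "cond"])

def pick_cond(schedule, current_step):
    # first entry whose end_at_step has not passed yet (mirrors the target_index loop)
    target_index = 0
    for current, entry in enumerate(schedule):
        if current_step <= entry.end_at_step:
            target_index = current
            break
    return schedule[target_index].cond

def pad_to_token_count(tensors):
    token_count = max(t.shape[0] for t in tensors)
    padded = []
    for t in tensors:
        if t.shape[0] != token_count:
            last = np.tile(t[-1:], (token_count - t.shape[0], 1))
            t = np.concatenate([t, last], axis=0)
        padded.append(t)
    return np.stack(padded)

schedule = [ScheduledPromptConditioning(10, np.zeros((77, 768))),
            ScheduledPromptConditioning(20, np.ones((77, 768)))]
print(pick_cond(schedule, 5).mean(), pick_cond(schedule, 15).mean())            # 0.0 1.0
print(pad_to_token_count([np.zeros((77, 768)), np.zeros((154, 768))]).shape)    # (2, 154, 768)
```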
[^\\()\[\]:]+| : """, - re.X, ) + re.X, +) re_break = re.compile(r"\s*\bBREAK\b\s*", re.S) @@ -2102,15 +2027,12 @@ class StableDiffusionModelHijack: layers = None circular_enabled = False - def __init__(self, - clip_model, - embeddings_dir=None, - CLIP_stop_at_last_layers=-1): + def __init__(self, clip_model, embeddings_dir=None, CLIP_stop_at_last_layers=-1): model_embeddings = clip_model.text_encoder.text_model - model_embeddings.token_embedding = EmbeddingsWithFixes( - model_embeddings.token_embedding, self) + model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.token_embedding, self) clip_model = FrozenCLIPEmbedderWithCustomWords( - clip_model, self, CLIP_stop_at_last_layers=CLIP_stop_at_last_layers) + clip_model, self, CLIP_stop_at_last_layers=CLIP_stop_at_last_layers + ) self.embedding_db = EmbeddingDatabase(clip_model) self.embedding_db.add_embedding_dir(embeddings_dir) @@ -2148,8 +2070,7 @@ def forward(self, input_ids): inputs_embeds = self.wrapped(input_ids) - if (batch_fixes is None or len(batch_fixes) == 0 or - max([len(x) for x in batch_fixes]) == 0): + if batch_fixes is None or len(batch_fixes) == 0 or max([len(x) for x in batch_fixes]) == 0: return inputs_embeds vecs = [] @@ -2157,11 +2078,13 @@ def forward(self, input_ids): for offset, embedding in fixes: emb = embedding.vec.cast(self.wrapped.dtype) emb_len = min(tensor.shape[0] - offset - 1, emb.shape[0]) - tensor = paddle.concat([ - tensor[0:offset + 1], - emb[0:emb_len], - tensor[offset + 1 + emb_len:], - ]) + tensor = paddle.concat( + [ + tensor[0 : offset + 1], + emb[0:emb_len], + tensor[offset + 1 + emb_len :], + ] + ) vecs.append(tensor) @@ -2190,12 +2113,8 @@ def __init__(self, vec, name, step=None): def save(self, filename): embedding_data = { - "string_to_token": { - "*": 265 - }, - "string_to_param": { - "*": self.vec - }, + "string_to_token": {"*": 265}, + "string_to_param": {"*": self.vec}, "name": self.name, "step": self.step, "sd_checkpoint": self.sd_checkpoint, @@ -2267,7 +2186,8 @@ def register_embedding(self, embedding, model): self.ids_lookup[first_id] = sorted( self.ids_lookup[first_id] + [(ids, embedding)], key=lambda x: len(x[0]), - reverse=True, ) + reverse=True, + ) return embedding @@ -2285,8 +2205,7 @@ def load_from_file(self, path, filename): return embed_image = Image.open(path) - if hasattr(embed_image, - "text") and "sd-ti-embedding" in embed_image.text: + if hasattr(embed_image, "text") and "sd-ti-embedding" in embed_image.text: data = embedding_from_b64(embed_image.text["sd-ti-embedding"]) name = data.get("name", name) else: @@ -2308,14 +2227,11 @@ def load_from_file(self, path, filename): param_dict = data["string_to_param"] if hasattr(param_dict, "_parameters"): param_dict = getattr(param_dict, "_parameters") - assert len( - param_dict) == 1, "embedding file has multiple terms in it" + assert len(param_dict) == 1, "embedding file has multiple terms in it" emb = next(iter(param_dict.items()))[1] # diffuser concepts - elif type(data) == dict and type(next(iter(data.values( - )))) == paddle.Tensor: - assert len(data.keys( - )) == 1, "embedding file has multiple terms in it" + elif type(data) == dict and type(next(iter(data.values()))) == paddle.Tensor: + assert len(data.keys()) == 1, "embedding file has multiple terms in it" emb = next(iter(data.values())) if len(emb.shape) == 1: @@ -2387,7 +2303,8 @@ def load_textual_inversion_embeddings(self, force_reload=False): displayed_embeddings = ( tuple(self.word_embeddings.keys()), - tuple(self.skipped_embeddings.keys()), ) + 
tuple(self.skipped_embeddings.keys()), + ) if self.previously_displayed_embeddings != displayed_embeddings: self.previously_displayed_embeddings = displayed_embeddings print( @@ -2406,7 +2323,7 @@ def find_embedding_at_position(self, tokens, offset): return None, None for ids, embedding in possible_matches: - if tokens[offset:offset + len(ids)] == ids: + if tokens[offset : offset + len(ids)] == ids: return embedding, len(ids) return None, None diff --git a/ppdiffusers/examples/community/wildcard_stable_diffusion.py b/ppdiffusers/examples/community/wildcard_stable_diffusion.py index 80eb36c2a700c..93ad2d40a130a 100644 --- a/ppdiffusers/examples/community/wildcard_stable_diffusion.py +++ b/ppdiffusers/examples/community/wildcard_stable_diffusion.py @@ -21,18 +21,18 @@ from typing import Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import \ - StableDiffusionPipelineOutput -from ppdiffusers.pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker -from ppdiffusers.schedulers import (DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import ( + StableDiffusionPipelineOutput, +) +from ppdiffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) +from ppdiffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ppdiffusers.utils import deprecate, logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -50,8 +50,7 @@ def read_wildcard_values(path: str): return f.read().splitlines() -def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]]={}, - wildcard_files: List[str]=[]): +def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]] = {}, wildcard_files: List[str] = []): for wildcard_file in wildcard_files: filename = get_filename(wildcard_file) read_values = read_wildcard_values(wildcard_file) @@ -62,19 +61,18 @@ def grab_wildcard_values(wildcard_option_dict: Dict[str, List[str]]={}, def replace_prompt_with_wildcards( - prompt: str, - wildcard_option_dict: Dict[str, List[str]]={}, - wildcard_files: List[str]=[], ): + prompt: str, + wildcard_option_dict: Dict[str, List[str]] = {}, + wildcard_files: List[str] = [], +): new_prompt = prompt # get wildcard options - wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, - wildcard_files) + wildcard_option_dict = grab_wildcard_values(wildcard_option_dict, wildcard_files) for m in global_re_wildcard.finditer(new_prompt): wildcard_value = m.group() - replace_value = random.choice(wildcard_option_dict[wildcard_value.strip( - "__")]) + replace_value = random.choice(wildcard_option_dict[wildcard_value.strip("__")]) new_prompt = new_prompt.replace(wildcard_value, replace_value, 1) return new_prompt @@ -125,31 +123,27 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: 
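`find_embedding_at_position` above matches multi-token textual-inversion triggers greedily: `register_embedding` indexes candidates by their first token id and sorts them longest-first, so the longest trigger wins at any position. A compact stand-in with plain lists:

```python
def find_embedding_at_position(ids_lookup, tokens, offset):
    """ids_lookup: {first_token_id: [(ids_tuple, embedding_name), ...]} sorted longest-first."""
    possible_matches = ids_lookup.get(tokens[offset])
    if possible_matches is None:
        return None, None
    for ids, embedding in possible_matches:
        if tuple(tokens[offset:offset + len(ids)]) == tuple(ids):
            return embedding, len(ids)
    return None, None

lookup = {101: [((101, 102, 103), "style-long"), ((101, 102), "style-short")]}
print(find_embedding_at_position(lookup, [7, 101, 102, 103, 9], 1))  # ('style-long', 3)
print(find_embedding_at_position(lookup, [7, 101, 102, 9], 1))       # ('style-short', 2)
```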
StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -171,29 +165,31 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: int=512, - width: int=512, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - seed: Optional[int]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - wildcard_option_dict: Dict[str, List[str]]={}, - wildcard_files: List[str]=[], - num_prompt_samples: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + seed: Optional[int] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + wildcard_option_dict: Dict[str, List[str]] = {}, + wildcard_files: List[str] = [], + num_prompt_samples: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. 
Args: @@ -254,8 +250,7 @@ def __call__( if isinstance(prompt, str): prompt = [ - replace_prompt_with_wildcards(prompt, wildcard_option_dict, - wildcard_files) + replace_prompt_with_wildcards(prompt, wildcard_option_dict, wildcard_files) for i in range(num_prompt_samples) ] batch_size = len(prompt) @@ -263,52 +258,46 @@ def __call__( prompt_list = [] for p in prompt: for i in range(num_prompt_samples): - prompt_list.append( - replace_prompt_with_wildcards(p, wildcard_option_dict, - wildcard_files)) + prompt_list.append(replace_prompt_with_wildcards(p, wildcard_option_dict, wildcard_files)) prompt = prompt_list batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) # get prompt text embeddings text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] attention_mask = paddle.ones_like(text_input_ids) - text_embeddings = self.text_encoder( - text_input_ids, attention_mask=attention_mask)[0] + text_embeddings = self.text_encoder(text_input_ids, attention_mask=attention_mask)[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -322,14 +311,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -339,23 +330,20 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = paddle.ones_like(uncond_input.input_ids) - uncond_embeddings = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask)[0] + uncond_embeddings = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it @@ -375,9 +363,7 @@ def __call__( latents = paddle.randn(latents_shape, dtype=latents_dtype) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") latents = latents # set timesteps @@ -394,33 +380,26 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -435,12 +414,11 @@ def __call__( image = image.transpose([0, 2, 3, 1]).astype("float32").numpy() if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.astype( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.astype(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -450,7 +428,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return WildcardStableDiffusionOutput( - images=image, - nsfw_content_detected=has_nsfw_concept, - prompts=prompt) + return WildcardStableDiffusionOutput(images=image, nsfw_content_detected=has_nsfw_concept, prompts=prompt) diff --git a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py b/ppdiffusers/examples/controlnet/annotator/hed/__init__.py index bd00e8dcc89f6..2088a37dbd9a5 100644 --- a/ppdiffusers/examples/controlnet/annotator/hed/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/hed/__init__.py @@ -27,133 +27,60 @@ def __init__(self, model_path=None): super().__init__() self.netVggOne = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=3, - out_channels=64, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=64, - out_channels=64, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + paddle.nn.Conv2D(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + ) self.netVggTwo = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=64, - 
out_channels=128, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + paddle.nn.Conv2D(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=128, - out_channels=128, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + ) self.netVggThr = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=128, - out_channels=256, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=256, - out_channels=256, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=256, - out_channels=256, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + ) self.netVggFou = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=256, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + ) self.netVggFiv = paddle.nn.Sequential( - paddle.nn.MaxPool2D( - kernel_size=2, stride=2), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.MaxPool2D(kernel_size=2, stride=2), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), + paddle.nn.ReLU(), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), + paddle.nn.Conv2D(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1), paddle.nn.ReLU(), - paddle.nn.Conv2D( - in_channels=512, - out_channels=512, - kernel_size=3, - stride=1, - padding=1), - paddle.nn.ReLU(), ) - - self.netScoreOne = paddle.nn.Conv2D( - in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreTwo = paddle.nn.Conv2D( - in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreThr = paddle.nn.Conv2D( - in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreFou = paddle.nn.Conv2D( - in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) - self.netScoreFiv = paddle.nn.Conv2D( - in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) + ) + + self.netScoreOne = paddle.nn.Conv2D(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreTwo = 
paddle.nn.Conv2D(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreThr = paddle.nn.Conv2D(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreFou = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) + self.netScoreFiv = paddle.nn.Conv2D(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0) self.netCombine = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=5, - out_channels=1, - kernel_size=1, - stride=1, - padding=0), - paddle.nn.Sigmoid(), ) + paddle.nn.Conv2D(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0), + paddle.nn.Sigmoid(), + ) if model_path: self.set_state_dict(paddle.load(model_path)) @@ -162,7 +89,8 @@ def forward(self, tenInput): tenInput = tenInput * 255.0 tenInput = tenInput - paddle.to_tensor( [104.00698793, 116.66876762, 122.67891434], - dtype=tenInput.dtype, ).reshape([1, 3, 1, 1]) + dtype=tenInput.dtype, + ).reshape([1, 3, 1, 1]) tenVggOne = self.netVggOne(tenInput) tenVggTwo = self.netVggTwo(tenVggOne) @@ -180,47 +108,48 @@ def forward(self, tenInput): tenScoreOne, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreTwo = paddle.nn.functional.interpolate( tenScoreTwo, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreThr = paddle.nn.functional.interpolate( tenScoreThr, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreFou = paddle.nn.functional.interpolate( tenScoreFou, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) tenScoreFiv = paddle.nn.functional.interpolate( tenScoreFiv, size=(tenInput.shape[2], tenInput.shape[3]), mode="bilinear", - align_corners=False, ) + align_corners=False, + ) - return self.netCombine( - paddle.concat([ - tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv - ], 1)) + return self.netCombine(paddle.concat([tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv], 1)) -remote_model_path = "https://paddlenlp.bj.bcebos.com/models/community/westfish/network-bsds500-paddle/network-bsds500.pdparams" +remote_model_path = ( + "https://paddlenlp.bj.bcebos.com/models/community/westfish/network-bsds500-paddle/network-bsds500.pdparams" +) class HEDdetector: def __init__(self, modelpath=None): - modelpath = os.path.join(annotator_ckpts_path, - "network-bsds500.pdparams") + modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pdparams") if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import \ - get_path_from_url_with_filelock + from paddlenlp.utils.downloader import get_path_from_url_with_filelock - get_path_from_url_with_filelock( - remote_model_path, root_dir=annotator_ckpts_path) + get_path_from_url_with_filelock(remote_model_path, root_dir=annotator_ckpts_path) self.model_path = modelpath self.netNetwork = Network(modelpath) self.netNetwork.eval() diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py index ecd0bf926d74d..543d0774c523a 100644 --- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/midas_paddle/__init__.py @@ -44,7 +44,6 @@ def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1): x[depth_pt < bg_th] = 0 y[depth_pt < 
bg_th] = 0 normal = np.stack([x, y, z], axis=2) - normal /= np.sum(normal**2.0, axis=2, keepdims=True)**0.5 - normal_image = (normal * 127.5 + 127.5).clip( - min=0, max=255).astype(np.uint8) + normal /= np.sum(normal**2.0, axis=2, keepdims=True) ** 0.5 + normal_image = (normal * 127.5 + 127.5).clip(min=0, max=255).astype(np.uint8) return depth_image, normal_image diff --git a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py b/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py index f93fa96d31b20..4726391519074 100644 --- a/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py +++ b/ppdiffusers/examples/controlnet/annotator/midas_paddle/api_inference.py @@ -19,25 +19,23 @@ def checkmodel(model_dir, model_name): - if not os.path.exists( - os.path.join(model_dir, model_name, model_name + ".pdmodel")): + if not os.path.exists(os.path.join(model_dir, model_name, model_name + ".pdmodel")): model_url = "https://bj.bcebos.com/v1/paddledet/models/dpt_hybrid.zip" get_path_from_url_with_filelock(model_url, root_dir=model_dir) class MidasInference: def __init__( - self, - model_dir, - model_name="dpt_hybrid", - batchsize=8, - device="GPU", - run_mode="paddle", ): + self, + model_dir, + model_name="dpt_hybrid", + batchsize=8, + device="GPU", + run_mode="paddle", + ): checkmodel(model_dir, model_name) - model_file = os.path.join(model_dir, model_name, - model_name + ".pdmodel") - params_file = os.path.join(model_dir, model_name, - model_name + ".pdiparams") + model_file = os.path.join(model_dir, model_name, model_name + ".pdmodel") + params_file = os.path.join(model_dir, model_name, model_name + ".pdiparams") config = paddle_infer.Config(model_file, params_file) self.batchsize = batchsize if device == "GPU": @@ -69,12 +67,12 @@ def __init__( min_subgraph_size=3, precision_mode=precision_map[run_mode], use_static=False, - use_calib_mode=False, ) + use_calib_mode=False, + ) min_input_shape = {"image": [1, 3, 224, 224]} max_input_shape = {"image": [1, 3, 1280, 1280]} opt_input_shape = {"image": [1, 3, 384, 384]} - config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, - opt_input_shape) + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape) # disable print log when predict config.disable_glog_info() diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py b/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py index 8e9d9e35206a6..8e453eef33c28 100644 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/mlsd/__init__.py @@ -27,13 +27,11 @@ class MLSDdetector: def __init__(self): - model_path = os.path.join(annotator_ckpts_path, - "mlsd_large_512_fp32.pdparams") + model_path = os.path.join(annotator_ckpts_path, "mlsd_large_512_fp32.pdparams") if not os.path.exists(model_path): from basicsr.utils.download_util import load_file_from_url - load_file_from_url( - remote_model_path, model_dir=annotator_ckpts_path) + load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path) self.model = MobileV2_MLSD_Large() self.model.eval() self.model.set_dict(paddle.load(model_path)) @@ -43,10 +41,8 @@ def __call__(self, input_image, thr_v, thr_d): img = input_image img_output = np.zeros_like(img) with paddle.no_grad(): - lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], - thr_v, thr_d) + lines = pred_lines(img, self.model, [img.shape[0], img.shape[1]], thr_v, thr_d) for line in lines: x_start, y_start, x_end, y_end 
= [int(val) for val in line] - cv2.line(img_output, (x_start, y_start), (x_end, y_end), - [255, 255, 255], 1) + cv2.line(img_output, (x_start, y_start), (x_end, y_end), [255, 255, 255], 1) return img_output[:, :, (0)] diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py b/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py index c1f08257cff39..d9123b0102d3c 100644 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py +++ b/ppdiffusers/examples/controlnet/annotator/mlsd/models/mbv2_mlsd_large.py @@ -20,35 +20,36 @@ class BlockTypeA(paddle.nn.Layer): def __init__(self, in_c1, in_c2, out_c1, out_c2, upscale=True): super(BlockTypeA, self).__init__() self.conv1 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c2, out_channels=out_c2, kernel_size=1), + paddle.nn.Conv2D(in_channels=in_c2, out_channels=out_c2, kernel_size=1), paddle.nn.BatchNorm2D( num_features=out_c2, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c1, out_channels=out_c1, kernel_size=1), + paddle.nn.Conv2D(in_channels=in_c1, out_channels=out_c1, kernel_size=1), paddle.nn.BatchNorm2D( num_features=out_c1, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.upscale = upscale def forward(self, a, b): b = self.conv1(b) a = self.conv2(a) if self.upscale: - b = paddle.nn.functional.interpolate( - x=b, scale_factor=2.0, mode="bilinear", align_corners=True) + b = paddle.nn.functional.interpolate(x=b, scale_factor=2.0, mode="bilinear", align_corners=True) return paddle.concat(x=(a, b), axis=1) @@ -56,27 +57,29 @@ class BlockTypeB(paddle.nn.Layer): def __init__(self, in_c, out_c): super(BlockTypeB, self).__init__() self.conv1 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), + paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), paddle.nn.BatchNorm2D( num_features=in_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c, out_channels=out_c, kernel_size=3, padding=1), + paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=3, padding=1), paddle.nn.BatchNorm2D( num_features=out_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) def forward(self, x): x = self.conv1(x) + x @@ -93,28 +96,31 @@ def __init__(self, in_c, out_c): out_channels=in_c, kernel_size=3, padding=5, - dilation=5, ), + dilation=5, + ), paddle.nn.BatchNorm2D( num_features=in_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU(), ) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) self.conv2 = paddle.nn.Sequential( - paddle.nn.Conv2D( - in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), + paddle.nn.Conv2D(in_channels=in_c, out_channels=in_c, kernel_size=3, padding=1), paddle.nn.BatchNorm2D( num_features=in_c, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - 
use_global_stats=True, ), - paddle.nn.ReLU(), ) - self.conv3 = paddle.nn.Conv2D( - in_channels=in_c, out_channels=out_c, kernel_size=1) + use_global_stats=True, + ), + paddle.nn.ReLU(), + ) + self.conv3 = paddle.nn.Conv2D(in_channels=in_c, out_channels=out_c, kernel_size=1) def forward(self, x): x = self.conv1(x) @@ -143,8 +149,7 @@ def _make_divisible(v, divisor, min_value=None): class ConvBNReLU(paddle.nn.Sequential): - def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, - groups=1): + def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): self.channel_pad = out_planes - in_planes self.stride = stride if stride == 2: @@ -159,23 +164,23 @@ def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, stride=stride, padding=padding, groups=groups, - bias_attr=False, ), + bias_attr=False, + ), paddle.nn.BatchNorm2D( num_features=out_planes, momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr=None, - use_global_stats=True, ), - paddle.nn.ReLU6(), ) + use_global_stats=True, + ), + paddle.nn.ReLU6(), + ) self.max_pool = paddle.nn.MaxPool2D(kernel_size=stride, stride=stride) def forward(self, x): if self.stride == 2: - x = paddle.nn.functional.pad(x=x, - pad=(0, 1, 0, 1), - mode="constant", - value=0) + x = paddle.nn.functional.pad(x=x, pad=(0, 1, 0, 1), mode="constant", value=0) for module in self: if not isinstance(module, paddle.nn.MaxPool2D): x = module(x) @@ -192,24 +197,27 @@ def __init__(self, inp, oup, stride, expand_ratio): layers = [] if expand_ratio != 1: layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) - layers.extend([ - ConvBNReLU( - hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), - paddle.nn.Conv2D( - in_channels=hidden_dim, - out_channels=oup, - kernel_size=1, - stride=1, - padding=0, - bias_attr=False, ), - paddle.nn.BatchNorm2D( - num_features=oup, - momentum=1 - 0.1, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - use_global_stats=True, ), - ]) + layers.extend( + [ + ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), + paddle.nn.Conv2D( + in_channels=hidden_dim, + out_channels=oup, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False, + ), + paddle.nn.BatchNorm2D( + num_features=oup, + momentum=1 - 0.1, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + use_global_stats=True, + ), + ] + ) self.conv = paddle.nn.Sequential(*layers) def forward(self, x): @@ -244,23 +252,20 @@ def __init__(self): [6, 64, 4, 2], [6, 96, 3, 1], ] - if (len(inverted_residual_setting) == 0 or - len(inverted_residual_setting[0]) != 4): + if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: raise ValueError( - "inverted_residual_setting should be non-empty or a 4-element list, got {}". 
- format(inverted_residual_setting)) - input_channel = _make_divisible(input_channel * width_mult, - round_nearest) - self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), - round_nearest) + "inverted_residual_setting should be non-empty or a 4-element list, got {}".format( + inverted_residual_setting + ) + ) + input_channel = _make_divisible(input_channel * width_mult, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest) features = [ConvBNReLU(4, input_channel, stride=2)] for t, c, n, s in inverted_residual_setting: output_channel = _make_divisible(c * width_mult, round_nearest) for i in range(n): stride = s if i == 0 else 1 - features.append( - block( - input_channel, output_channel, stride, expand_ratio=t)) + features.append(block(input_channel, output_channel, stride, expand_ratio=t)) input_channel = output_channel self.features = paddle.nn.Sequential(*features) self.fpn_selected = [1, 3, 6, 10, 13] @@ -295,8 +300,7 @@ class MobileV2_MLSD_Large(paddle.nn.Layer): def __init__(self): super(MobileV2_MLSD_Large, self).__init__() self.backbone = MobileNetV2() - self.block15 = BlockTypeA( - in_c1=64, in_c2=96, out_c1=64, out_c2=64, upscale=False) + self.block15 = BlockTypeA(in_c1=64, in_c2=96, out_c1=64, out_c2=64, upscale=False) self.block16 = BlockTypeB(128, 64) self.block17 = BlockTypeA(in_c1=32, in_c2=64, out_c1=64, out_c2=64) self.block18 = BlockTypeB(128, 64) diff --git a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py b/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py index e533433631fb1..1ad8429e69fb9 100644 --- a/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py +++ b/ppdiffusers/examples/controlnet/annotator/mlsd/utils.py @@ -17,6 +17,7 @@ import cv2 import numpy as np import paddle + """ M-LSD Copyright 2021-present NAVER Corp. 
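The MobileNetV2 hunks above repeatedly route channel counts through `_make_divisible(c * width_mult, round_nearest)`, but only the signature appears in the diff context. As a point of reference, a conventional implementation of that rounding helper — the body below is an assumption matching the widely used MobileNetV2 reference code, not text from this patch — looks like:

```python
def _make_divisible(v, divisor, min_value=None):
    """Round `v` to the nearest multiple of `divisor`, never going below
    `min_value` and never shrinking the result by more than 10%."""
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Guard against rounding down by more than 10% of the original value.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
```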
@@ -48,11 +49,7 @@ def zeros_(tensor): return _no_grad_fill_(tensor, 0) -def kaiming_normal_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_normal_ Args: @@ -100,13 +97,11 @@ def _calculate_gain(nonlinearity, param=None): elif nonlinearity == "leaky_relu": if param is None: negative_slope = 0.01 - elif (not isinstance(param, bool) and isinstance(param, int) or - isinstance(param, float)): + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): # True/False are instances of int, hence check above negative_slope = param else: - raise ValueError("negative_slope {} not a valid number".format( - param)) + raise ValueError("negative_slope {} not a valid number".format(param)) return math.sqrt(2.0 / (1 + negative_slope**2)) elif nonlinearity == "selu": return 3.0 / 4 @@ -119,8 +114,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False): mode = mode.lower() valid_modes = ["fan_in", "fan_out"] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) @@ -137,9 +131,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False): Tuple[fan_in, fan_out] """ if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") if reverse: num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] @@ -168,8 +160,8 @@ def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): center = tpMap[:, (0), :, :] heat = paddle.nn.functional.sigmoid(x=center).unsqueeze(0) hmax = paddle.nn.functional.max_pool2d( - kernel_size=(ksize, ksize), stride=1, padding=(ksize - 1) // 2, - x=heat).squeeze(0) + kernel_size=(ksize, ksize), stride=1, padding=(ksize - 1) // 2, x=heat + ).squeeze(0) keep = (hmax == heat).astype(dtype="float32") heat = heat * keep heat = heat.reshape([-1]) @@ -185,21 +177,16 @@ def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): return ptss, scores, displacement -def pred_lines(image, - model, - input_shape=[512, 512], - score_thr=0.1, - dist_thr=20.0): +def pred_lines(image, model, input_shape=[512, 512], score_thr=0.1, dist_thr=20.0): h, w, _ = image.shape h_ratio, w_ratio = [h / input_shape[0], w / input_shape[1]] resized_image = np.concatenate( [ - cv2.resize( - image, (input_shape[1], input_shape[0]), - interpolation=cv2.INTER_AREA), + cv2.resize(image, (input_shape[1], input_shape[0]), interpolation=cv2.INTER_AREA), np.ones([input_shape[0], input_shape[1], 1]), ], - axis=-1, ) + axis=-1, + ) resized_image = resized_image.transpose((2, 0, 1)) batch_image = np.expand_dims(resized_image, axis=0).astype("float32") batch_image = batch_image / 127.5 - 1.0 @@ -208,14 +195,13 @@ def pred_lines(image, pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] end = vmap[:, :, 2:] - dist_map = np.sqrt(np.sum((start - end)**2, axis=-1)) + dist_map = np.sqrt(np.sum((start - end) ** 2, axis=-1)) segments_list = [] for center, score in zip(pts, pts_score): y, x = center distance = dist_map[y, x] if score > score_thr and distance > dist_thr: - disp_x_start, disp_y_start, 
disp_x_end, disp_y_end = vmap[(y), ( - x), :] + disp_x_start, disp_y_start, disp_x_end, disp_y_end = vmap[(y), (x), :] x_start = x + disp_x_start y_start = y + disp_y_start x_end = x + disp_x_end diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py b/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py index 7dc16bd999550..e07f249e8c9fe 100644 --- a/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/openpose/__init__.py @@ -33,25 +33,19 @@ def __call__(self, oriImg, hand=False): with paddle.no_grad(): canvas = oriImg[:, :, ::-1].copy() canvas.fill(0) - result = self.body_estimation.predict( - oriImg, save_path="saved_images", visualization=False) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], - result["subset"]) + result = self.body_estimation.predict(oriImg, save_path="saved_images", visualization=False) + canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) if hand: - hands_list = util.hand_detect(result["candidate"], - result["subset"], oriImg) + hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg) all_hand_peaks = [] for x, y, w, is_left in hands_list: scale_search = [0.5, 1.0, 1.5, 2.0] peaks = self.hand_estimation.hand_estimation( - oriImg[y:y + w, x:x + w, :], scale_search=scale_search) - peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], - peaks[:, 0] + x) - peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], - peaks[:, 1] + y) + oriImg[y : y + w, x : x + w, :], scale_search=scale_search + ) + peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x) + peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y) all_hand_peaks.append(peaks) canvas = self.hand_estimation.draw_hand(canvas, all_hand_peaks) - return canvas, dict( - candidate=result["candidate"].tolist(), - subset=result["subset"].tolist()) + return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist()) diff --git a/ppdiffusers/examples/controlnet/annotator/openpose/util.py b/ppdiffusers/examples/controlnet/annotator/openpose/util.py index 10028380bbd8a..899e38121eaea 100644 --- a/ppdiffusers/examples/controlnet/annotator/openpose/util.py +++ b/ppdiffusers/examples/controlnet/annotator/openpose/util.py @@ -47,8 +47,7 @@ def pad_right_down_corner(img, stride, padValue): def transfer(model, model_weights): transfered_model_weights = {} for weights_name in model.state_dict().keys(): - transfered_model_weights[weights_name] = model_weights[".".join( - weights_name.split(".")[1:])] + transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])] return transfered_model_weights @@ -114,11 +113,9 @@ def draw_bodypose(canvas, candidate, subset): X = candidate[index.astype(int), 1] mX = np.mean(X) mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) cv2.fillConvexPoly(cur_canvas, polygon, colors[i]) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) return canvas @@ -158,9 +155,9 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): canvas, (x1, y1), (x2, y2), - matplotlib.colors.hsv_to_rgb( - [ie / 
float(len(edges)), 1.0, 1.0]) * 255, - thickness=2, ) + matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, + thickness=2, + ) for i, keyponit in enumerate(peaks): x, y = keyponit @@ -173,7 +170,8 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), - lineType=cv2.LINE_AA, ) + lineType=cv2.LINE_AA, + ) return canvas @@ -194,16 +192,14 @@ def hand_detect(candidate, subset, oriImg): hands = [] # left hand if has_left: - left_shoulder_index, left_elbow_index, left_wrist_index = person[ - [5, 6, 7]] + left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] x1, y1 = candidate[left_shoulder_index][:2] x2, y2 = candidate[left_elbow_index][:2] x3, y3 = candidate[left_wrist_index][:2] hands.append([x1, y1, x2, y2, x3, y3, True]) # right hand if has_right: - right_shoulder_index, right_elbow_index, right_wrist_index = person[ - [2, 3, 4]] + right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] x1, y1 = candidate[right_shoulder_index][:2] x2, y2 = candidate[right_elbow_index][:2] x3, y3 = candidate[right_wrist_index][:2] @@ -218,8 +214,8 @@ def hand_detect(candidate, subset, oriImg): # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); x = x3 + ratioWristElbow * (x3 - x2) y = y3 + ratioWristElbow * (y3 - y2) - distanceWristElbow = math.sqrt((x3 - x2)**2 + (y3 - y2)**2) - distanceElbowShoulder = math.sqrt((x2 - x1)**2 + (y2 - y1)**2) + distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) + distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) # x-y refers to the center --> offset to topLeft point # handRectangle.x -= handRectangle.width / 2.f; diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py index 0bb742e72d02a..d2d5ee7249851 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/__init__.py @@ -39,10 +39,8 @@ def keypoint_to_openpose_kpts(coco_keypoints_list): l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index] r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index] - neck_keypoint_y = int( - (l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) - neck_keypoint_x = int( - (l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) + neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) + neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) neck_keypoint = [ neck_keypoint_x, neck_keypoint_y, @@ -65,33 +63,24 @@ def __call__(self, oriImg, detect_resolution=512, hand=False): img_scalarfactor = detect_resolution / min(oriImg.shape[:2]) result = self.ppdetpose_pred(oriImg) result["candidate"] = result["candidate"] * img_scalarfactor - oriImg = cv2.resize( - oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) + oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) canvas = oriImg.copy() canvas.fill(0) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], - result["subset"]) + canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) if hand: - hands_list = util.hand_detect(result["candidate"], - result["subset"], oriImg) + hands_list = util.hand_detect(result["candidate"], result["subset"], oriImg) all_hand_peaks = [] for x, y, w, is_left in 
hands_list: - scale_search = [ - x * img_scalarfactor for x in [0.5, 1.0, 1.5, 2.0] - ] + scale_search = [x * img_scalarfactor for x in [0.5, 1.0, 1.5, 2.0]] peaks = self.hand_estimation.hand_estimation( - oriImg[y:y + w, x:x + w, ::-1], - scale_search=scale_search) - peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], - peaks[:, 0] + x) - peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], - peaks[:, 1] + y) + oriImg[y : y + w, x : x + w, ::-1], scale_search=scale_search + ) + peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x) + peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y) all_hand_peaks.append(peaks) canvas = util.draw_handpose(canvas, all_hand_peaks) - return canvas, dict( - candidate=result["candidate"].tolist(), - subset=result["subset"].tolist()) + return canvas, dict(candidate=result["candidate"].tolist(), subset=result["subset"].tolist()) def ppdetpose_pred(self, image, kpt_threshold=0.3): poseres = self.ppdetpose.ppdet_hrnet_infer(image) @@ -105,7 +94,12 @@ def ppdetpose_pred(self, image, kpt_threshold=0.3): for idx, item in enumerate(openpose_kpts): if item[2] > kpt_threshold: subset[kptid][idx] = posnum - kpt = np.array(item + [posnum, ]) + kpt = np.array( + item + + [ + posnum, + ] + ) candidate = np.vstack((candidate, kpt)) posnum += 1 return {"candidate": candidate, "subset": subset} diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py index 86f7aca10c143..9236875761299 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/benchmark_utils.py @@ -25,13 +25,14 @@ class PaddleInferBenchmark(object): def __init__( - self, - config, - model_info: dict={}, - data_info: dict={}, - perf_info: dict={}, - resource_info: dict={}, - **kwargs, ): + self, + config, + model_info: dict = {}, + data_info: dict = {}, + perf_info: dict = {}, + resource_info: dict = {}, + **kwargs, + ): """ Construct PaddleInferBenchmark Class to format logs. 
args: @@ -84,8 +85,7 @@ def __init__( self.inference_time_s = round(perf_info["inference_time_s"], 4) except: self.print_help() - raise ValueError( - "Set argument wrong, please check input argument and its type") + raise ValueError("Set argument wrong, please check input argument and its type") self.preprocess_time_s = perf_info.get("preprocess_time_s", 0) self.postprocess_time_s = perf_info.get("postprocess_time_s", 0) @@ -142,13 +142,12 @@ def benchmark_logger(self): level=logging.INFO, format=FORMAT, handlers=[ - logging.FileHandler( - filename=log_output, mode="w"), + logging.FileHandler(filename=log_output, mode="w"), logging.StreamHandler(), - ], ) + ], + ) self.logger = logging.getLogger(__name__) - self.logger.info( - f"Paddle Inference benchmark log will be saved to {log_output}") + self.logger.info(f"Paddle Inference benchmark log will be saved to {log_output}") def parse_config(self, config) -> dict: """ @@ -160,28 +159,22 @@ def parse_config(self, config) -> dict: """ if isinstance(config, paddle_infer.Config): config_status = {} - config_status["runtime_device"] = "gpu" if config.use_gpu( - ) else "cpu" + config_status["runtime_device"] = "gpu" if config.use_gpu() else "cpu" config_status["ir_optim"] = config.ir_optim() config_status["enable_tensorrt"] = config.tensorrt_engine_enabled() config_status["precision"] = self.precision config_status["enable_mkldnn"] = config.mkldnn_enabled() - config_status[ - "cpu_math_library_num_threads"] = config.cpu_math_library_num_threads( - ) + config_status["cpu_math_library_num_threads"] = config.cpu_math_library_num_threads() elif isinstance(config, dict): config_status["runtime_device"] = config.get("runtime_device", "") config_status["ir_optim"] = config.get("ir_optim", "") config_status["enable_tensorrt"] = config.get("enable_tensorrt", "") config_status["precision"] = config.get("precision", "") config_status["enable_mkldnn"] = config.get("enable_mkldnn", "") - config_status["cpu_math_library_num_threads"] = config.get( - "cpu_math_library_num_threads", "") + config_status["cpu_math_library_num_threads"] = config.get("cpu_math_library_num_threads", "") else: self.print_help() - raise ValueError( - "Set argument config wrong, please check input argument and its type" - ) + raise ValueError("Set argument config wrong, please check input argument and its type") return config_status def report(self, identifier=None): @@ -196,54 +189,43 @@ def report(self, identifier=None): identifier = "" self.logger.info("\n") - self.logger.info( - "---------------------- Paddle info ----------------------") + self.logger.info("---------------------- Paddle info ----------------------") self.logger.info(f"{identifier} paddle_version: {self.paddle_version}") self.logger.info(f"{identifier} paddle_commit: {self.paddle_commit}") self.logger.info(f"{identifier} paddle_branch: {self.paddle_branch}") self.logger.info(f"{identifier} log_api_version: {self.log_version}") - self.logger.info( - "----------------------- Conf info -----------------------") - self.logger.info( - f"{identifier} runtime_device: {self.config_status['runtime_device']}" - ) - self.logger.info( - f"{identifier} ir_optim: {self.config_status['ir_optim']}") + self.logger.info("----------------------- Conf info -----------------------") + self.logger.info(f"{identifier} runtime_device: {self.config_status['runtime_device']}") + self.logger.info(f"{identifier} ir_optim: {self.config_status['ir_optim']}") self.logger.info(f"{identifier} enable_memory_optim: {True}") - self.logger.info( - 
f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}" - ) - self.logger.info( - f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}") + self.logger.info(f"{identifier} enable_tensorrt: {self.config_status['enable_tensorrt']}") + self.logger.info(f"{identifier} enable_mkldnn: {self.config_status['enable_mkldnn']}") self.logger.info( f"{identifier} cpu_math_library_num_threads: {self.config_status['cpu_math_library_num_threads']}" ) - self.logger.info( - "----------------------- Model info ----------------------") + self.logger.info("----------------------- Model info ----------------------") self.logger.info(f"{identifier} model_name: {self.model_name}") self.logger.info(f"{identifier} precision: {self.precision}") - self.logger.info( - "----------------------- Data info -----------------------") + self.logger.info("----------------------- Data info -----------------------") self.logger.info(f"{identifier} batch_size: {self.batch_size}") self.logger.info(f"{identifier} input_shape: {self.shape}") self.logger.info(f"{identifier} data_num: {self.data_num}") - self.logger.info( - "----------------------- Perf info -----------------------") + self.logger.info("----------------------- Perf info -----------------------") self.logger.info( f"{identifier} cpu_rss(MB): {self.cpu_rss_mb}, cpu_vms: {self.cpu_vms_mb}, cpu_shared_mb: {self.cpu_shared_mb}, cpu_dirty_mb: {self.cpu_dirty_mb}, cpu_util: {self.cpu_util}%" ) self.logger.info( f"{identifier} gpu_rss(MB): {self.gpu_rss_mb}, gpu_util: {self.gpu_util}%, gpu_mem_util: {self.gpu_mem_util}%" ) - self.logger.info( - f"{identifier} total time spent(s): {self.total_time_s}") + self.logger.info(f"{identifier} total time spent(s): {self.total_time_s}") if self.with_tracker: self.logger.info( f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " f"inference_time(ms): {round(self.inference_time_s*1000, 1)}, " f"postprocess_time(ms): {round(self.postprocess_time_s*1000, 1)}, " - f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}") + f"tracking_time(ms): {round(self.tracking_time_s*1000, 1)}" + ) else: self.logger.info( f"{identifier} preprocess_time(ms): {round(self.preprocess_time_s*1000, 1)}, " @@ -261,7 +243,8 @@ def print_help(self): """ print function help """ - print("""Usage: + print( + """Usage: ==== Print inference benchmark logs. 
==== config = paddle.inference.Config() model_info = {'model_name': 'resnet50' @@ -278,7 +261,8 @@ def print_help(self): 'gpu_util': 60} log = PaddleInferBenchmark(config, model_info, data_info, perf_info, resource_info) log('Test') - """) + """ + ) def __call__(self, identifier=None): """ diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py index a89c4c830c5be..3d3a8578fd2bd 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_infer.py @@ -24,8 +24,7 @@ from .det_keypoint_unite_utils import argsparser from .infer import PredictConfig # noqa F401 -from .infer import (Detector, DetectorPicoDet, bench_log, get_test_images, - print_arguments) +from .infer import bench_log, get_test_images, print_arguments from .keypoint_infer import KeyPointDetector from .keypoint_postprocess import translate_to_ori_images from .preprocess import decode_image @@ -38,12 +37,10 @@ } -def predict_with_given_det(image, det_res, keypoint_detector, - keypoint_batch_size, run_benchmark): +def predict_with_given_det(image, det_res, keypoint_detector, keypoint_batch_size, run_benchmark): keypoint_res = {} - rec_images, records, det_rects = keypoint_detector.get_person_from_rect( - image, det_res) + rec_images, records, det_rects = keypoint_detector.get_person_from_rect(image, det_res) if len(det_rects) == 0: keypoint_res["keypoint"] = [[], []] @@ -53,23 +50,22 @@ def predict_with_given_det(image, det_res, keypoint_detector, score_vector = [] rect_vector = det_rects - keypoint_results = keypoint_detector.predict_image( - rec_images, run_benchmark, repeats=10, visual=False) - keypoint_vector, score_vector = translate_to_ori_images(keypoint_results, - np.array(records)) + keypoint_results = keypoint_detector.predict_image(rec_images, run_benchmark, repeats=10, visual=False) + keypoint_vector, score_vector = translate_to_ori_images(keypoint_results, np.array(records)) keypoint_res["keypoint"] = ( - [keypoint_vector.tolist(), score_vector.tolist()] - if len(keypoint_vector) > 0 else [[], []]) + [keypoint_vector.tolist(), score_vector.tolist()] if len(keypoint_vector) > 0 else [[], []] + ) keypoint_res["bbox"] = rect_vector return keypoint_res def topdown_unite_predict( - detector, - topdown_keypoint_detector, - image_list, - keypoint_batch_size=1, - save_res=False, ): + detector, + topdown_keypoint_detector, + image_list, + keypoint_batch_size=1, + save_res=False, +): det_timer = detector.get_timer() store_res = [] for i, img_file in enumerate(image_list): @@ -79,8 +75,7 @@ def topdown_unite_predict( det_timer.preprocess_time_s.end() if FLAGS.run_benchmark: - results = detector.predict_image( - [image], run_benchmark=True, repeats=10) + results = detector.predict_image([image], run_benchmark=True, repeats=10) cm, gm, gu = get_current_memory_mb() detector.cpu_mem += cm @@ -95,15 +90,18 @@ def topdown_unite_predict( results, topdown_keypoint_detector, keypoint_batch_size, - FLAGS.run_benchmark, ) + FLAGS.run_benchmark, + ) if save_res: save_name = img_file if isinstance(img_file, str) else i - store_res.append([ - save_name, - keypoint_res["bbox"], - [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], - ]) + store_res.append( + [ + save_name, + keypoint_res["bbox"], + [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], + ] + ) else: results["keypoint"] = [[], []] 
keypoint_res = results @@ -119,7 +117,8 @@ def topdown_unite_predict( img_file, keypoint_res, visual_thresh=FLAGS.keypoint_threshold, - save_dir=FLAGS.output_dir, ) + save_dir=FLAGS.output_dir, + ) if save_res: """ 1) store_res: a list of image_data @@ -133,18 +132,17 @@ def topdown_unite_predict( def topdown_unite_predict_singleimage( - detector, - topdown_keypoint_detector, - image, - keypoint_batch_size=8, - det_threshold=0.25, ): + detector, + topdown_keypoint_detector, + image, + keypoint_batch_size=8, + det_threshold=0.25, +): results = detector.predict_image([image], visual=False) results = detector.filter_box(results, det_threshold) if results["boxes_num"] > 0: - keypoint_res = predict_with_given_det(image, results, - topdown_keypoint_detector, - keypoint_batch_size, False) + keypoint_res = predict_with_given_det(image, results, topdown_keypoint_detector, keypoint_batch_size, False) else: results["keypoint"] = [[], []] @@ -153,11 +151,12 @@ def topdown_unite_predict_singleimage( def topdown_unite_predict_video( - detector, - topdown_keypoint_detector, - camera_id, - keypoint_batch_size=1, - save_res=False, ): + detector, + topdown_keypoint_detector, + camera_id, + keypoint_batch_size=1, + save_res=False, +): video_name = "output.mp4" if camera_id != -1: capture = cv2.VideoCapture(camera_id) @@ -174,12 +173,11 @@ def topdown_unite_predict_video( if not os.path.exists(FLAGS.output_dir): os.makedirs(FLAGS.output_dir) out_path = os.path.join(FLAGS.output_dir, video_name) - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) index = 0 store_res = [] - keypoint_smoothing = KeypointSmoothing( - width, height, filter_type=FLAGS.filter_type, beta=0.05) + keypoint_smoothing = KeypointSmoothing(width, height, filter_type=FLAGS.filter_type, beta=0.05) while 1: ret, frame = capture.read() @@ -201,27 +199,25 @@ def topdown_unite_predict_video( results, topdown_keypoint_detector, keypoint_batch_size, - FLAGS.run_benchmark, ) + FLAGS.run_benchmark, + ) if FLAGS.smooth and len(keypoint_res["keypoint"][0]) == 1: current_keypoints = np.array(keypoint_res["keypoint"][0][0]) - smooth_keypoints = keypoint_smoothing.smooth_process( - current_keypoints) + smooth_keypoints = keypoint_smoothing.smooth_process(current_keypoints) keypoint_res["keypoint"][0][0] = smooth_keypoints.tolist() - im = visualize_pose( - frame, - keypoint_res, - visual_thresh=FLAGS.keypoint_threshold, - returnimg=True) + im = visualize_pose(frame, keypoint_res, visual_thresh=FLAGS.keypoint_threshold, returnimg=True) if save_res: - store_res.append([ - index, - keypoint_res["bbox"], - [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], - ]) + store_res.append( + [ + index, + keypoint_res["bbox"], + [keypoint_res["keypoint"][0], keypoint_res["keypoint"][1]], + ] + ) writer.write(im) if camera_id != -1: @@ -247,37 +243,43 @@ class KeypointSmoothing(object): # https://github.com/jaantollander/OneEuroFilter def __init__( - self, - width, - height, - filter_type, - alpha=0.5, - fc_d=0.1, - fc_min=0.1, - beta=0.1, - thres_mult=0.3, ): + self, + width, + height, + filter_type, + alpha=0.5, + fc_d=0.1, + fc_min=0.1, + beta=0.1, + thres_mult=0.3, + ): super(KeypointSmoothing, self).__init__() self.image_width = width self.image_height = height - self.threshold = (np.array([ - 0.005, - 0.005, - 0.005, - 0.005, - 0.005, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - 0.01, - ]) * thres_mult) + 
self.threshold = ( + np.array( + [ + 0.005, + 0.005, + 0.005, + 0.005, + 0.005, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + 0.01, + ] + ) + * thres_mult + ) self.filter_type = filter_type self.alpha = alpha self.dx_prev_hat = None @@ -302,20 +304,18 @@ def smooth_process(self, current_keypoints): result = current_keypoints num_keypoints = len(current_keypoints) for i in range(num_keypoints): - result[i, :2] = self.smooth(current_keypoints[i, :2], - self.threshold[i], i) + result[i, :2] = self.smooth(current_keypoints[i, :2], self.threshold[i], i) return result def smooth(self, current_keypoint, threshold, index): distance = np.sqrt( - np.square((current_keypoint[0] - self.x_prev_hat[index][0]) / - self.image_width) + np.square((current_keypoint[ - 1] - self.x_prev_hat[index][1]) / self.image_height)) + np.square((current_keypoint[0] - self.x_prev_hat[index][0]) / self.image_width) + + np.square((current_keypoint[1] - self.x_prev_hat[index][1]) / self.image_height) + ) if distance < threshold: result = self.x_prev_hat[index] else: - result = self.smooth_func(current_keypoint, self.x_prev_hat[index], - index) + result = self.smooth_func(current_keypoint, self.x_prev_hat[index], index) return result @@ -360,15 +360,13 @@ def exponential_smoothing(self, x_cur, x_pre, index=0): det_threshold = 0.4 if not os.path.exists(det_model_dir): - detmodel_url = "https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/picodet_s_320_lcnet_pedestrian.zip" - get_path_from_url_with_filelock( - detmodel_url, root_dir="annotator/ppdet_hrnet/models/") -if not os.path.exists(keypoint_model_dir): - kptmodel_url = ( - "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip" + detmodel_url = ( + "https://bj.bcebos.com/v1/paddledet/models/keypoint/tinypose_enhance/picodet_s_320_lcnet_pedestrian.zip" ) - get_path_from_url_with_filelock( - kptmodel_url, root_dir="annotator/ppdet_hrnet/models/") + get_path_from_url_with_filelock(detmodel_url, root_dir="annotator/ppdet_hrnet/models/") +if not os.path.exists(keypoint_model_dir): + kptmodel_url = "https://bj.bcebos.com/v1/paddledet/models/pipeline/dark_hrnet_w32_256x192.zip" + get_path_from_url_with_filelock(kptmodel_url, root_dir="annotator/ppdet_hrnet/models/") class PPDetPose(object): @@ -391,7 +389,8 @@ def __init__(self) -> None: trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, - threshold=det_threshold, ) + threshold=det_threshold, + ) self.topdown_keypoint_detector = KeyPointDetector( keypoint_model_dir, @@ -404,7 +403,8 @@ def __init__(self) -> None: trt_calib_mode=trt_calib_mode, cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, - use_dark=use_dark, ) + use_dark=use_dark, + ) keypoint_arch = self.topdown_keypoint_detector.pred_config.arch assert ( KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown" @@ -417,7 +417,8 @@ def ppdet_hrnet_infer(self, image): self.topdown_keypoint_detector, image, keypoint_batch_size, - det_threshold, ) + det_threshold, + ) def main(): @@ -439,7 +440,8 @@ def main(): trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, enable_mkldnn=FLAGS.enable_mkldnn, - threshold=FLAGS.det_threshold, ) + threshold=FLAGS.det_threshold, + ) topdown_keypoint_detector = KeyPointDetector( FLAGS.keypoint_model_dir, @@ -452,7 +454,8 @@ def main(): trt_calib_mode=FLAGS.trt_calib_mode, cpu_threads=FLAGS.cpu_threads, enable_mkldnn=FLAGS.enable_mkldnn, - use_dark=FLAGS.use_dark, ) + use_dark=FLAGS.use_dark, + ) 
keypoint_arch = topdown_keypoint_detector.pred_config.arch assert ( KEYPOINT_SUPPORT_MODELS[keypoint_arch] == "keypoint_topdown" @@ -465,7 +468,8 @@ def main(): topdown_keypoint_detector, FLAGS.camera_id, FLAGS.keypoint_batch_size, - FLAGS.save_res, ) + FLAGS.save_res, + ) else: # predict from image img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) @@ -474,7 +478,8 @@ def main(): topdown_keypoint_detector, img_list, FLAGS.keypoint_batch_size, - FLAGS.save_res, ) + FLAGS.save_res, + ) if not FLAGS.run_benchmark: detector.det_times.info(average=True) topdown_keypoint_detector.det_times.info(average=True) @@ -496,7 +501,8 @@ def main(): img_list, keypoint_model_info, FLAGS.keypoint_batch_size, - "KeyPoint", ) + "KeyPoint", + ) if __name__ == "__main__": @@ -505,7 +511,6 @@ def main(): FLAGS = parser.parse_args() print_arguments(FLAGS) FLAGS.device = FLAGS.device.upper() - assert FLAGS.device in ["CPU", "GPU", "XPU" - ], "device should be CPU, GPU or XPU" + assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU" main() diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py index 5290e03d818fa..0d023a6d28d57 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/det_keypoint_unite_utils.py @@ -22,58 +22,60 @@ def argsparser(): "--det_model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), - required=True, ) + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." + ), + required=True, + ) parser.add_argument( "--keypoint_model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), - required=True, ) - parser.add_argument( - "--image_file", type=str, default=None, help="Path of image file.") + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." + ), + required=True, + ) + parser.add_argument("--image_file", type=str, default=None, help="Path of image file.") parser.add_argument( "--image_dir", type=str, default=None, - help="Dir of image file, `image_file` has a higher priority.", ) + help="Dir of image file, `image_file` has a higher priority.", + ) parser.add_argument( "--keypoint_batch_size", type=int, default=8, - help=("batch_size for keypoint inference. In detection-keypoint unit" - "inference, the batch size in detection is 1. Then collate det " - "result in batch for keypoint inference."), ) + help=( + "batch_size for keypoint inference. In detection-keypoint unit" + "inference, the batch size in detection is 1. Then collate det " + "result in batch for keypoint inference." 
+ ), + ) parser.add_argument( "--video_file", type=str, default=None, help="Path of video file, `video_file` or `camera_id` has a highest priority.", ) - parser.add_argument( - "--camera_id", - type=int, - default=-1, - help="device id of camera to predict.") - parser.add_argument( - "--det_threshold", type=float, default=0.5, help="Threshold of score.") - parser.add_argument( - "--keypoint_threshold", - type=float, - default=0.5, - help="Threshold of score.") + parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.") + parser.add_argument("--det_threshold", type=float, default=0.5, help="Threshold of score.") + parser.add_argument("--keypoint_threshold", type=float, default=0.5, help="Threshold of score.") parser.add_argument( "--output_dir", type=str, default="output", - help="Directory of output visualization files.", ) + help="Directory of output visualization files.", + ) parser.add_argument( "--run_mode", type=str, default="paddle", - help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", ) + help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", + ) parser.add_argument( "--device", type=str, @@ -84,32 +86,24 @@ def argsparser(): "--run_benchmark", type=ast.literal_eval, default=False, - help="Whether to predict a image_file repeatedly for benchmark", ) + help="Whether to predict a image_file repeatedly for benchmark", + ) parser.add_argument( "--enable_mkldnn", type=ast.literal_eval, default=False, - help="Whether use mkldnn with CPU.", ) - parser.add_argument( - "--cpu_threads", type=int, default=1, help="Num of threads with CPU.") - parser.add_argument( - "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") - parser.add_argument( - "--trt_max_shape", - type=int, - default=1280, - help="max_shape for TensorRT.") - parser.add_argument( - "--trt_opt_shape", - type=int, - default=640, - help="opt_shape for TensorRT.") + help="Whether use mkldnn with CPU.", + ) + parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.") + parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") + parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.") + parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.") parser.add_argument( "--trt_calib_mode", type=bool, default=False, - help="If the model is produced by TRT offline quantitative " - "calibration, trt_calib_mode need to set True.", ) + help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.", + ) parser.add_argument( "--use_dark", type=ast.literal_eval, @@ -126,7 +120,9 @@ def argsparser(): "2) image_data: [imageid, rects, [keypoints, scores]]" "3) rects: list of rect [xmin, ymin, xmax, ymax]" "4) keypoints: 17(joint numbers)*[x, y, conf], total 51 data in list" - "5) scores: mean of all joint conf"), ) + "5) scores: mean of all joint conf" + ), + ) parser.add_argument( "--smooth", type=ast.literal_eval, diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py index a2a9769e224e0..6d4135cdfb9a6 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/infer.py @@ -25,16 +25,17 @@ from paddle.inference import Config, create_predictor from .benchmark_utils import PaddleInferBenchmark -from .keypoint_preprocess import ( - EvalAffine, 
- TopDownEvalAffine, # noqa F401 - expand_crop) +from .keypoint_preprocess import EvalAffine, TopDownEvalAffine, expand_crop # noqa F401 from .picodet_postprocess import PicoDetPostProcess from .preprocess import Pad # noqa F401 -from .preprocess import (LetterBoxResize, NormalizeImage, PadStride, Permute, - Resize, WarpAffine, decode_image, preprocess) -from .utils import (Timer, argsparser, coco_clsid2catid, get_current_memory_mb, - multiclass_nms) +from .preprocess import preprocess +from .utils import ( + Timer, + argsparser, + coco_clsid2catid, + get_current_memory_mb, + multiclass_nms, +) from .visualize import visualize_box_mask # Global dictionary @@ -81,8 +82,7 @@ def bench_log(detector, img_list, model_info, batch_size=1, name=None): "shape": "dynamic_shape", "data_num": perf_info["img_num"], } - log = PaddleInferBenchmark(detector.config, model_info, data_info, - perf_info, mems) + log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems) log(name) @@ -109,21 +109,22 @@ class Detector(object): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="output", - threshold=0.5, - delete_shuffle_pass=False, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + output_dir="output", + threshold=0.5, + delete_shuffle_pass=False, + ): self.pred_config = self.set_config(model_dir) self.predictor, self.config = load_predictor( model_dir, @@ -140,7 +141,8 @@ def __init__( cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, - delete_shuffle_pass=delete_shuffle_pass, ) + delete_shuffle_pass=delete_shuffle_pass, + ) self.det_times = Timer() self.cpu_mem, self.gpu_mem, self.gpu_util = 0, 0, 0 self.batch_size = batch_size @@ -177,9 +179,7 @@ def preprocess(self, image_list): def postprocess(self, inputs, result): # postprocess output of predictor np_boxes_num = result["boxes_num"] - assert isinstance( - np_boxes_num, - np.ndarray), "`np_boxes_num` should be a `numpy.ndarray`" + assert isinstance(np_boxes_num, np.ndarray), "`np_boxes_num` should be a `numpy.ndarray`" result = {k: v for k, v in result.items() if v is not None} return result @@ -192,7 +192,7 @@ def filter_box(self, result, threshold): filter_num = [] for i in range(len(np_boxes_num)): boxes_num = np_boxes_num[i] - boxes_i = boxes[start_idx:start_idx + boxes_num, :] + boxes_i = boxes[start_idx : start_idx + boxes_num, :] idx = boxes_i[:, 1] > threshold filter_boxes_i = boxes_i[idx, :] filter_boxes.append(filter_boxes_i) @@ -220,8 +220,7 @@ def predict(self, repeats=1, run_benchmark=False): for i in range(repeats): self.predictor.run() paddle.device.cuda.synchronize() - result = dict( - boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) + result = dict(boxes=np_boxes, masks=np_masks, boxes_num=np_boxes_num) return result for i in range(repeats): @@ -258,17 +257,18 @@ def get_timer(self): return self.det_times def predict_image_slice( - self, - img_list, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method="nms", - match_threshold=0.6, - match_metric="ios", - run_benchmark=False, - repeats=1, - visual=True, - save_results=False, ): + self, + img_list, + 
slice_size=[640, 640], + overlap_ratio=[0.25, 0.25], + combine_method="nms", + match_threshold=0.6, + match_metric="ios", + run_benchmark=False, + repeats=1, + visual=True, + save_results=False, + ): # slice infer only support bs=1 results = [] try: @@ -287,14 +287,13 @@ def predict_image_slice( slice_height=slice_size[0], slice_width=slice_size[1], overlap_height_ratio=overlap_ratio[0], - overlap_width_ratio=overlap_ratio[1], ) + overlap_width_ratio=overlap_ratio[1], + ) sub_img_num = len(slice_image_result) merged_bboxs = [] print("slice to {} sub_samples.", sub_img_num) - batch_image_list = [ - slice_image_result.images[_ind] for _ind in range(sub_img_num) - ] + batch_image_list = [slice_image_result.images[_ind] for _ind in range(sub_img_num)] if run_benchmark: # preprocess inputs = self.preprocess(batch_image_list) # warmup @@ -341,10 +340,8 @@ def predict_image_slice( boxes_num = result["boxes_num"][_ind] ed = st + boxes_num shift_amount = slice_image_result.starting_pixels[_ind] - result["boxes"][st:ed][:, 2:4] = ( - result["boxes"][st:ed][:, 2:4] + shift_amount) - result["boxes"][st:ed][:, 4:6] = ( - result["boxes"][st:ed][:, 4:6] + shift_amount) + result["boxes"][st:ed][:, 2:4] = result["boxes"][st:ed][:, 2:4] + shift_amount + result["boxes"][st:ed][:, 4:6] = result["boxes"][st:ed][:, 4:6] + shift_amount merged_bboxs.append(result["boxes"][st:ed]) st = ed @@ -354,16 +351,14 @@ def predict_image_slice( np.concatenate(merged_bboxs), num_classes, match_threshold, - match_metric, ) + match_metric, + ) merged_results["boxes"] = np.concatenate(final_boxes) elif combine_method == "concat": merged_results["boxes"] = np.concatenate(merged_bboxs) else: - raise ValueError( - "Now only support 'nms' or 'concat' to fuse detection results." - ) - merged_results["boxes_num"] = np.array( - [len(merged_results["boxes"])], dtype=np.int32) + raise ValueError("Now only support 'nms' or 'concat' to fuse detection results.") + merged_results["boxes_num"] = np.array([len(merged_results["boxes"])], dtype=np.int32) if visual: visualize( @@ -371,24 +366,25 @@ def predict_image_slice( merged_results, self.pred_config.labels, output_dir=self.output_dir, - threshold=self.threshold, ) + threshold=self.threshold, + ) results.append(merged_results) results = self.merge_batch_result(results) if save_results: Path(self.output_dir).mkdir(exist_ok=True) - self.save_coco_results( - img_list, results, use_coco_category=FLAGS.use_coco_category) + self.save_coco_results(img_list, results, use_coco_category=FLAGS.use_coco_category) return results def predict_image( - self, - image_list, - run_benchmark=False, - repeats=1, - visual=True, - save_results=False, ): + self, + image_list, + run_benchmark=False, + repeats=1, + visual=True, + save_results=False, + ): batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) results = [] for i in range(batch_loop_cnt): @@ -442,13 +438,13 @@ def predict_image( result, self.pred_config.labels, output_dir=self.output_dir, - threshold=self.threshold, ) + threshold=self.threshold, + ) results.append(result) results = self.merge_batch_result(results) if save_results: Path(self.output_dir).mkdir(exist_ok=True) - self.save_coco_results( - image_list, results, use_coco_category=FLAGS.use_coco_category) + self.save_coco_results(image_list, results, use_coco_category=FLAGS.use_coco_category) return results def predict_video(self, video_file, camera_id): @@ -468,7 +464,7 @@ def predict_video(self, video_file, camera_id): if not os.path.exists(self.output_dir): 
os.makedirs(self.output_dir) out_path = os.path.join(self.output_dir, video_out_name) - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) index = 1 while 1: @@ -479,11 +475,7 @@ def predict_video(self, video_file, camera_id): index += 1 results = self.predict_image([frame[:, :, ::-1]], visual=False) - im = visualize_box_mask( - frame, - results, - self.pred_config.labels, - threshold=self.threshold) + im = visualize_box_mask(frame, results, self.pred_config.labels, threshold=self.threshold) im = np.array(im) writer.write(im) if camera_id != -1: @@ -505,43 +497,44 @@ def save_coco_results(self, image_list, results, use_coco_category=False): img_id = i if "boxes" in results: - boxes = results["boxes"][idx:idx + box_num].tolist() - bbox_results.extend([ - { - "image_id": img_id, - "category_id": coco_clsid2catid[int(box[0])] - if use_coco_category else int(box[0]), - "file_name": file_name, - "bbox": [ - box[2], - box[3], - box[4] - box[2], - box[5] - box[3], - ], # xyxy -> xywh - "score": box[1], - } for box in boxes - ]) + boxes = results["boxes"][idx : idx + box_num].tolist() + bbox_results.extend( + [ + { + "image_id": img_id, + "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]), + "file_name": file_name, + "bbox": [ + box[2], + box[3], + box[4] - box[2], + box[5] - box[3], + ], # xyxy -> xywh + "score": box[1], + } + for box in boxes + ] + ) if "masks" in results: import pycocotools.mask as mask_util - boxes = results["boxes"][idx:idx + box_num].tolist() + boxes = results["boxes"][idx : idx + box_num].tolist() masks = results["masks"][i][:box_num].astype(np.uint8) seg_res = [] for box, mask in zip(boxes, masks): - rle = mask_util.encode( - np.array( - mask[:, :, None], dtype=np.uint8, order="F"))[0] + rle = mask_util.encode(np.array(mask[:, :, None], dtype=np.uint8, order="F"))[0] if "counts" in rle: rle["counts"] = rle["counts"].decode("utf8") - seg_res.append({ - "image_id": img_id, - "category_id": coco_clsid2catid[int(box[0])] - if use_coco_category else int(box[0]), - "file_name": file_name, - "segmentation": rle, - "score": box[1], - }) + seg_res.append( + { + "image_id": img_id, + "category_id": coco_clsid2catid[int(box[0])] if use_coco_category else int(box[0]), + "file_name": file_name, + "segmentation": rle, + "score": box[1], + } + ) mask_results.extend(seg_res) idx += box_num @@ -579,20 +572,21 @@ class DetectorSOLOv2(Detector): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="./", - threshold=0.5, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + output_dir="./", + threshold=0.5, + ): super(DetectorSOLOv2, self).__init__( model_dir=model_dir, device=device, @@ -606,7 +600,8 @@ def __init__( enable_mkldnn=enable_mkldnn, enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, output_dir=output_dir, - threshold=threshold, ) + threshold=threshold, + ) def predict(self, repeats=1, run_benchmark=False): """ @@ -617,37 +612,24 @@ def predict(self, repeats=1, run_benchmark=False): 'cate_label': label of segm, shape:[N] 'cate_score': confidence score of segm, shape:[N] 
""" - np_segms, np_label, np_score, np_boxes_num = None, None, None, np.array( - [0]) + np_segms, np_label, np_score, np_boxes_num = None, None, None, np.array([0]) if run_benchmark: for i in range(repeats): self.predictor.run() paddle.device.cuda.synchronize() - result = dict( - segm=np_segms, - label=np_label, - score=np_score, - boxes_num=np_boxes_num) + result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num) return result for i in range(repeats): self.predictor.run() output_names = self.predictor.get_output_names() - np_boxes_num = self.predictor.get_output_handle(output_names[ - 0]).copy_to_cpu() - np_label = self.predictor.get_output_handle(output_names[ - 1]).copy_to_cpu() - np_score = self.predictor.get_output_handle(output_names[ - 2]).copy_to_cpu() - np_segms = self.predictor.get_output_handle(output_names[ - 3]).copy_to_cpu() - - result = dict( - segm=np_segms, - label=np_label, - score=np_score, - boxes_num=np_boxes_num) + np_boxes_num = self.predictor.get_output_handle(output_names[0]).copy_to_cpu() + np_label = self.predictor.get_output_handle(output_names[1]).copy_to_cpu() + np_score = self.predictor.get_output_handle(output_names[2]).copy_to_cpu() + np_segms = self.predictor.get_output_handle(output_names[3]).copy_to_cpu() + + result = dict(segm=np_segms, label=np_label, score=np_score, boxes_num=np_boxes_num) return result @@ -669,20 +651,21 @@ class DetectorPicoDet(Detector): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - output_dir="./", - threshold=0.5, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + output_dir="./", + threshold=0.5, + ): super(DetectorPicoDet, self).__init__( model_dir=model_dir, device=device, @@ -696,7 +679,8 @@ def __init__( enable_mkldnn=enable_mkldnn, enable_mkldnn_bfloat16=enable_mkldnn_bfloat16, output_dir=output_dir, - threshold=threshold, ) + threshold=threshold, + ) def postprocess(self, inputs, result): # postprocess output of predictor @@ -707,7 +691,8 @@ def postprocess(self, inputs, result): inputs["im_shape"], inputs["scale_factor"], strides=self.pred_config.fpn_stride, - nms_threshold=self.pred_config.nms["nms_threshold"], ) + nms_threshold=self.pred_config.nms["nms_threshold"], + ) np_boxes, np_boxes_num = postprocessor(np_score_list, np_boxes_list) result = dict(boxes=np_boxes, boxes_num=np_boxes_num) return result @@ -736,12 +721,8 @@ def predict(self, repeats=1, run_benchmark=False): output_names = self.predictor.get_output_names() num_outs = int(len(output_names) / 2) for out_idx in range(num_outs): - np_score_list.append( - self.predictor.get_output_handle(output_names[out_idx]) - .copy_to_cpu()) - np_boxes_list.append( - self.predictor.get_output_handle(output_names[ - out_idx + num_outs]).copy_to_cpu()) + np_score_list.append(self.predictor.get_output_handle(output_names[out_idx]).copy_to_cpu()) + np_boxes_list.append(self.predictor.get_output_handle(output_names[out_idx + num_outs]).copy_to_cpu()) result = dict(boxes=np_score_list, boxes_num=np_boxes_list) return result @@ -759,16 +740,14 @@ def create_inputs(imgs, im_info): im_shape = [] scale_factor = [] if len(imgs) == 1: - inputs["image"] = np.array((imgs[0], 
)).astype("float32") - inputs["im_shape"] = np.array( - (im_info[0]["im_shape"], )).astype("float32") - inputs["scale_factor"] = np.array( - (im_info[0]["scale_factor"], )).astype("float32") + inputs["image"] = np.array((imgs[0],)).astype("float32") + inputs["im_shape"] = np.array((im_info[0]["im_shape"],)).astype("float32") + inputs["scale_factor"] = np.array((im_info[0]["scale_factor"],)).astype("float32") return inputs for e in im_info: - im_shape.append(np.array((e["im_shape"], )).astype("float32")) - scale_factor.append(np.array((e["scale_factor"], )).astype("float32")) + im_shape.append(np.array((e["im_shape"],)).astype("float32")) + scale_factor.append(np.array((e["scale_factor"],)).astype("float32")) inputs["im_shape"] = np.concatenate(im_shape, axis=0) inputs["scale_factor"] = np.concatenate(scale_factor, axis=0) @@ -779,8 +758,7 @@ def create_inputs(imgs, im_info): padding_imgs = [] for img in imgs: im_c, im_h, im_w = img.shape[:] - padding_im = np.zeros( - (im_c, max_shape_h, max_shape_w), dtype=np.float32) + padding_im = np.zeros((im_c, max_shape_h, max_shape_w), dtype=np.float32) padding_im[:, :im_h, :im_w] = img padding_imgs.append(padding_im) inputs["image"] = np.stack(padding_imgs, axis=0) @@ -815,9 +793,7 @@ def __init__(self, model_dir): if "fpn_stride" in yml_conf: self.fpn_stride = yml_conf["fpn_stride"] if self.arch == "RCNN" and yml_conf.get("export_onnx", False): - print( - "The RCNN export model is used for ONNX and it only supports batch_size = 1" - ) + print("The RCNN export model is used for ONNX and it only supports batch_size = 1") self.print_config() def check_model(self, yml_conf): @@ -828,8 +804,7 @@ def check_model(self, yml_conf): for support_model in SUPPORT_MODELS: if support_model in yml_conf["arch"]: return True - raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ - "arch"], SUPPORT_MODELS)) + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], SUPPORT_MODELS)) def print_config(self): print("----------- Model Configuration -----------") @@ -841,22 +816,23 @@ def print_config(self): def load_predictor( - model_dir, - arch, - run_mode="paddle", - batch_size=1, - device="CPU", - min_subgraph_size=3, - use_dynamic_shape=False, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - enable_mkldnn_bfloat16=False, - delete_shuffle_pass=False, - tuned_trt_shape_file="shape_range_info.pbtxt", ): + model_dir, + arch, + run_mode="paddle", + batch_size=1, + device="CPU", + min_subgraph_size=3, + use_dynamic_shape=False, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + enable_mkldnn_bfloat16=False, + delete_shuffle_pass=False, + tuned_trt_shape_file="shape_range_info.pbtxt", +): """set AnalysisConfig, generate AnalysisPredictor Args: model_dir (str): root path of __model__ and __params__ @@ -877,16 +853,15 @@ def load_predictor( """ if device != "GPU" and run_mode != "paddle": raise ValueError( - "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}". 
- format(run_mode, device)) + "Predict by TensorRT mode: {}, expect device=='GPU', but device == {}".format(run_mode, device) + ) infer_model = os.path.join(model_dir, "model.pdmodel") infer_params = os.path.join(model_dir, "model.pdiparams") if not os.path.exists(infer_model): infer_model = os.path.join(model_dir, "inference.pdmodel") infer_params = os.path.join(model_dir, "inference.pdiparams") if not os.path.exists(infer_model): - raise ValueError( - "Cannot find any inference model in dir: {},".format(model_dir)) + raise ValueError("Cannot find any inference model in dir: {},".format(model_dir)) config = Config(infer_model, infer_params) if device == "GPU": # initial GPU memory(M), device ID @@ -912,9 +887,7 @@ def load_predictor( if enable_mkldnn_bfloat16: config.enable_mkldnn_bfloat16() except: - print( - "The current environment does not support `mkldnn`, so disable mkldnn." - ) + print("The current environment does not support `mkldnn`, so disable mkldnn.") pass precision_map = { @@ -931,10 +904,10 @@ def load_predictor( min_subgraph_size=min_subgraph_size, precision_mode=precision_map[run_mode], use_static=False, - use_calib_mode=trt_calib_mode, ) + use_calib_mode=trt_calib_mode, + ) if arch in TUNED_TRT_DYNAMIC_MODELS: - config.enable_tuned_tensorrt_dynamic_shape(tuned_trt_shape_file, - True) + config.enable_tuned_tensorrt_dynamic_shape(tuned_trt_shape_file, True) if use_dynamic_shape: min_input_shape = { @@ -949,8 +922,7 @@ def load_predictor( "image": [batch_size, 3, trt_opt_shape, trt_opt_shape], "scale_factor": [batch_size, 2], } - config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, - opt_input_shape) + config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape, opt_input_shape) print("trt set dynamic shape done!") # disable print log when predict @@ -969,12 +941,9 @@ def get_test_images(infer_dir, infer_img): """ Get image path list in TEST mode """ - assert (infer_img is not None or - infer_dir is not None), "--image_file or --image_dir should be set" - assert infer_img is None or os.path.isfile( - infer_img), "{} is not a file".format(infer_img) - assert infer_dir is None or os.path.isdir( - infer_dir), "{} is not a directory".format(infer_dir) + assert infer_img is not None or infer_dir is not None, "--image_file or --image_dir should be set" + assert infer_img is None or os.path.isfile(infer_img), "{} is not a file".format(infer_img) + assert infer_dir is None or os.path.isdir(infer_dir), "{} is not a directory".format(infer_dir) # infer_img has a higher priority if infer_img and os.path.isfile(infer_img): @@ -982,8 +951,7 @@ def get_test_images(infer_dir, infer_img): images = set() infer_dir = os.path.abspath(infer_dir) - assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format( - infer_dir) + assert os.path.isdir(infer_dir), "infer_dir {} is not a directory".format(infer_dir) exts = ["jpg", "jpeg", "png", "bmp"] exts += [ext.upper() for ext in exts] for ext in exts: @@ -1003,24 +971,18 @@ def visualize(image_list, result, labels, output_dir="output/", threshold=0.5): im_bboxes_num = result["boxes_num"][idx] im_results = {} if "boxes" in result: - im_results["boxes"] = result["boxes"][start_idx:start_idx + - im_bboxes_num, :] + im_results["boxes"] = result["boxes"][start_idx : start_idx + im_bboxes_num, :] if "masks" in result: - im_results["masks"] = result["masks"][start_idx:start_idx + - im_bboxes_num, :] + im_results["masks"] = result["masks"][start_idx : start_idx + im_bboxes_num, :] if "segm" in result: - im_results["segm"] 
= result["segm"][start_idx:start_idx + - im_bboxes_num, :] + im_results["segm"] = result["segm"][start_idx : start_idx + im_bboxes_num, :] if "label" in result: - im_results["label"] = result["label"][start_idx:start_idx + - im_bboxes_num] + im_results["label"] = result["label"][start_idx : start_idx + im_bboxes_num] if "score" in result: - im_results["score"] = result["score"][start_idx:start_idx + - im_bboxes_num] + im_results["score"] = result["score"][start_idx : start_idx + im_bboxes_num] start_idx += im_bboxes_num - im = visualize_box_mask( - image_file, im_results, labels, threshold=threshold) + im = visualize_box_mask(image_file, im_results, labels, threshold=threshold) img_name = os.path.split(image_file)[-1] if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -1060,7 +1022,8 @@ def main(): enable_mkldnn=FLAGS.enable_mkldnn, enable_mkldnn_bfloat16=FLAGS.enable_mkldnn_bfloat16, threshold=FLAGS.threshold, - output_dir=FLAGS.output_dir, ) + output_dir=FLAGS.output_dir, + ) # predict from video file or camera video stream if FLAGS.video_file is not None or FLAGS.camera_id != -1: @@ -1068,8 +1031,7 @@ def main(): else: # predict from image if FLAGS.image_dir is None and FLAGS.image_file is not None: - assert (FLAGS.batch_size == 1 - ), "batch_size should be 1, when image_file is not None" + assert FLAGS.batch_size == 1, "batch_size should be 1, when image_file is not None" img_list = get_test_images(FLAGS.image_dir, FLAGS.image_file) if FLAGS.slice_infer: detector.predict_image_slice( @@ -1080,14 +1042,16 @@ def main(): FLAGS.match_threshold, FLAGS.match_metric, visual=FLAGS.save_images, - save_results=FLAGS.save_results, ) + save_results=FLAGS.save_results, + ) else: detector.predict_image( img_list, FLAGS.run_benchmark, repeats=100, visual=FLAGS.save_images, - save_results=FLAGS.save_results, ) + save_results=FLAGS.save_results, + ) if not FLAGS.run_benchmark: detector.det_times.info(average=True) else: diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py index 8f661fb65fe6b..fa3551f584493 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_infer.py @@ -52,20 +52,21 @@ class KeyPointDetector(Detector): """ def __init__( - self, - model_dir, - device="CPU", - run_mode="paddle", - batch_size=1, - trt_min_shape=1, - trt_max_shape=1280, - trt_opt_shape=640, - trt_calib_mode=False, - cpu_threads=1, - enable_mkldnn=False, - output_dir="output", - threshold=0.5, - use_dark=True, ): + self, + model_dir, + device="CPU", + run_mode="paddle", + batch_size=1, + trt_min_shape=1, + trt_max_shape=1280, + trt_opt_shape=640, + trt_calib_mode=False, + cpu_threads=1, + enable_mkldnn=False, + output_dir="output", + threshold=0.5, + use_dark=True, + ): super(KeyPointDetector, self).__init__( model_dir=model_dir, device=device, @@ -78,7 +79,8 @@ def __init__( cpu_threads=cpu_threads, enable_mkldnn=enable_mkldnn, output_dir=output_dir, - threshold=threshold, ) + threshold=threshold, + ) self.use_dark = use_dark def set_config(self, model_dir): @@ -105,8 +107,7 @@ def postprocess(self, inputs, result): np_heatmap = result["heatmap"] np_masks = result["masks"] # postprocess output of predictor - if KEYPOINT_SUPPORT_MODELS[ - self.pred_config.arch] == "keypoint_bottomup": + if KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_bottomup": results = {} h, w = inputs["im_shape"][0] preds = 
[np_heatmap] @@ -118,8 +119,7 @@ def postprocess(self, inputs, result): results["keypoint"] = kpts results["score"] = scores return results - elif KEYPOINT_SUPPORT_MODELS[ - self.pred_config.arch] == "keypoint_topdown": + elif KEYPOINT_SUPPORT_MODELS[self.pred_config.arch] == "keypoint_topdown": results = {} imshape = inputs["im_shape"][:, ::-1] center = np.round(imshape / 2.0) @@ -130,8 +130,7 @@ def postprocess(self, inputs, result): results["score"] = scores return results else: - raise ValueError("Unsupported arch: {}, expect {}".format( - self.pred_config.arch, KEYPOINT_SUPPORT_MODELS)) + raise ValueError("Unsupported arch: {}, expect {}".format(self.pred_config.arch, KEYPOINT_SUPPORT_MODELS)) def predict(self, repeats=1): """ @@ -162,11 +161,7 @@ def predict(self, repeats=1): result = dict(heatmap=np_heatmap, masks=np_masks) return result - def predict_image(self, - image_list, - run_benchmark=False, - repeats=1, - visual=True): + def predict_image(self, image_list, run_benchmark=False, repeats=1, visual=True): results = [] batch_loop_cnt = math.ceil(float(len(image_list)) / self.batch_size) for i in range(batch_loop_cnt): @@ -222,7 +217,8 @@ def predict_image(self, batch_image_list, result, visual_thresh=self.threshold, - save_dir=self.output_dir, ) + save_dir=self.output_dir, + ) results.append(result) results = self.merge_batch_result(results) @@ -245,7 +241,7 @@ def predict_video(self, video_file, camera_id): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) out_path = os.path.join(self.output_dir, video_name) - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height)) index = 1 while 1: @@ -257,8 +253,7 @@ def predict_video(self, video_file, camera_id): results = self.predict_image([frame[:, :, ::-1]], visual=False) im_results = {} im_results["keypoint"] = [results["keypoint"], results["score"]] - im = visualize_pose( - frame, im_results, visual_thresh=self.threshold, returnimg=True) + im = visualize_pose(frame, im_results, visual_thresh=self.threshold, returnimg=True) writer.write(im) if camera_id != -1: cv2.imshow("Mask Detection", im) @@ -315,8 +310,7 @@ def check_model(self, yml_conf): for support_model in KEYPOINT_SUPPORT_MODELS: if support_model in yml_conf["arch"]: return True - raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf[ - "arch"], KEYPOINT_SUPPORT_MODELS)) + raise ValueError("Unsupported arch: {}, expect {}".format(yml_conf["arch"], KEYPOINT_SUPPORT_MODELS)) def print_config(self): print("----------- Model Configuration -----------") @@ -332,14 +326,10 @@ def visualize(image_list, results, visual_thresh=0.6, save_dir="output"): for i, image_file in enumerate(image_list): skeletons = results["keypoint"] scores = results["score"] - skeleton = skeletons[i:i + 1] - score = scores[i:i + 1] + skeleton = skeletons[i : i + 1] + score = scores[i : i + 1] im_results["keypoint"] = [skeleton, score] - visualize_pose( - image_file, - im_results, - visual_thresh=visual_thresh, - save_dir=save_dir) + visualize_pose(image_file, im_results, visual_thresh=visual_thresh, save_dir=save_dir) def main(): @@ -356,7 +346,8 @@ def main(): enable_mkldnn=FLAGS.enable_mkldnn, threshold=FLAGS.threshold, output_dir=FLAGS.output_dir, - use_dark=FLAGS.use_dark, ) + use_dark=FLAGS.use_dark, + ) # predict from video file or camera video stream if FLAGS.video_file is not None or FLAGS.camera_id != -1: @@ -385,8 +376,7 @@ def main(): "shape": "dynamic_shape", "data_num": 
perf_info["img_num"], } - det_log = PaddleInferBenchmark(detector.config, model_info, - data_info, perf_info, mems) + det_log = PaddleInferBenchmark(detector.config, model_info, data_info, perf_info, mems) det_log("KeyPoint") @@ -396,8 +386,7 @@ def main(): FLAGS = parser.parse_args() print_arguments(FLAGS) FLAGS.device = FLAGS.device.upper() - assert FLAGS.device in ["CPU", "GPU", "XPU" - ], "device should be CPU, GPU or XPU" + assert FLAGS.device in ["CPU", "GPU", "XPU"], "device should be CPU, GPU or XPU" assert not FLAGS.use_gpu, "use_gpu has been deprecated, please use --device" main() diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py index 01aa825cb00ee..8ba1f6a47b0cd 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_postprocess.py @@ -50,14 +50,11 @@ def lerp(self, j, y, x, heatmap): right = np.clip(x + 1, 0, W - 1) up = np.clip(y - 1, 0, H - 1) down = np.clip(y + 1, 0, H - 1) - offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, - -0.25) - offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, - -0.25) + offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, -0.25) + offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, -0.25) return offset_y + 0.5, offset_x + 0.5 - def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, - original_width): + def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, original_width): N, J, H, W = heatmap.shape assert N == 1, "only support batch size 1" @@ -67,8 +64,9 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, inds_np = inds_k[0] y = inds_np // W x = inds_np % W - tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), - y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1]) + tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), y.flatten(), x.flatten()].reshape( + J, -1, tagmap.shape[-1] + ) coords = np.stack((y, x), axis=2) # threshold mask = heats > self.heat_thresh @@ -94,11 +92,8 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, cluster[key]["scores"][jid] = heats[jid, i] cluster[key]["coords"][jid] = coords[jid, i] continue - candidates = list(cluster.keys())[:self.max_num_people] - centroids = [ - np.mean( - cluster[k]["tags"], axis=0) for k in candidates - ] + candidates = list(cluster.keys())[: self.max_num_people] + centroids = [np.mean(cluster[k]["tags"], axis=0) for k in candidates] num_clusters = len(centroids) # shape is (num_valid, num_clusters, tag_dim) dist = valid_tags[:, None, :] - np.array(centroids)[None, ...] 
@@ -111,12 +106,12 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, cost, ((0, 0), (0, num_valid - num_clusters)), "constant", - constant_values=((0, 0), (0, 1e-10)), ) + constant_values=((0, 0), (0, 1e-10)), + ) rows, cols = linear_sum_assignment(cost) for y, x in zip(rows, cols): tag = tags[jid, y] - if (y < num_valid and x < num_clusters and - l2_dist[y, x] < self.tag_thresh): + if y < num_valid and x < num_clusters and l2_dist[y, x] < self.tag_thresh: key = candidates[x] # merge to cluster else: key = tag[0] # initialize new cluster @@ -151,7 +146,7 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, if True: for pid, coords in enumerate(pose_coords): tag_mean = np.array(pose_tags[pid]).mean(axis=0) - norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5 + norm = np.sum((tagmap - tag_mean) ** 2, axis=3) ** 0.5 score = heatmap - np.round(norm) # (J, H, W) flat_score = score.reshape(J, -1) max_inds = np.argmax(flat_score, axis=1) @@ -167,9 +162,7 @@ def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, pose_coords[pid][salvage_joints, 0] = y pose_coords[pid][salvage_joints, 1] = x pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints] - pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], - original_height, original_width, - min(H, W)) + pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], original_height, original_width, min(H, W)) return pose_kpts, mean_score @@ -193,9 +186,7 @@ def warp_affine_joints(joints, mat): joints = np.array(joints) shape = joints.shape joints = joints.reshape(-1, 2) - return np.dot(np.concatenate( - (joints, joints[:, 0:1] * 0 + 1), axis=1), - mat.T).reshape(shape) + return np.dot(np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), mat.T).reshape(shape) class HRNetPostProcess(object): @@ -203,9 +194,7 @@ def __init__(self, use_dark=True): self.use_dark = use_dark def flip_back(self, output_flipped, matched_parts): - assert ( - output_flipped.ndim == 4 - ), "output_flipped should be [batch_size, num_joints, height, width]" + assert output_flipped.ndim == 4, "output_flipped should be [batch_size, num_joints, height, width]" output_flipped = output_flipped[:, :, :, ::-1] @@ -226,8 +215,7 @@ def get_max_preds(self, heatmaps): preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints """ - assert isinstance(heatmaps, - np.ndarray), "heatmaps should be numpy.ndarray" + assert isinstance(heatmaps, np.ndarray), "heatmaps should be numpy.ndarray" assert heatmaps.ndim == 4, "batch_images should be 4-ndim" batch_size = heatmaps.shape[0] @@ -277,10 +265,8 @@ def dark_parse(self, hm, coord): dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) - dxy = 0.25 * (hm[py + 1][px + 1] - hm[py - 1][px + 1] - - hm[py + 1][px - 1] + hm[py - 1][px - 1]) - dyy = 0.25 * ( - hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) + dxy = 0.25 * (hm[py + 1][px + 1] - hm[py - 1][px + 1] - hm[py + 1][px - 1] + hm[py - 1][px - 1]) + dyy = 0.25 * (hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) derivative = np.matrix([[dx], [dy]]) hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) if dxx * dyy - dxy**2 != 0: @@ -331,25 +317,24 @@ def get_final_preds(self, heatmaps, center, scale, kernelsize=3): px = int(math.floor(coords[n][p][0] + 0.5)) py = int(math.floor(coords[n][p][1] + 0.5)) if 1 < px < 
heatmap_width - 1 and 1 < py < heatmap_height - 1: - diff = np.array([ - hm[py][px + 1] - hm[py][px - 1], - hm[py + 1][px] - hm[py - 1][px], - ]) + diff = np.array( + [ + hm[py][px + 1] - hm[py][px - 1], + hm[py + 1][px] - hm[py - 1][px], + ] + ) coords[n][p] += np.sign(diff) * 0.25 preds = coords.copy() # Transform back for i in range(coords.shape[0]): - preds[i] = transform_preds(coords[i], center[i], scale[i], - [heatmap_width, heatmap_height]) + preds[i] = transform_preds(coords[i], center[i], scale[i], [heatmap_width, heatmap_height]) return preds, maxvals def __call__(self, output, center, scale): preds, maxvals = self.get_final_preds(output, center, scale) - return np.concatenate( - (preds, maxvals), axis=-1), np.mean( - maxvals, axis=1) + return np.concatenate((preds, maxvals), axis=-1), np.mean(maxvals, axis=1) def transform_preds(coords, center, scale, output_size): diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py index 86bf7e57c6605..68173f62bd043 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/keypoint_preprocess.py @@ -48,18 +48,12 @@ def get_affine_mat_kernel(h, w, s, inv=False): center = np.array([np.round(w / 2.0), np.round(h / 2.0)]) size_resized = (w_, h_) - trans = get_affine_transform( - center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) + trans = get_affine_transform(center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) return trans, size_resized -def get_affine_transform(center, - input_size, - rot, - output_size, - shift=(0.0, 0.0), - inv=False): +def get_affine_transform(center, input_size, rot, output_size, shift=(0.0, 0.0), inv=False): """Get the affine transform matrix, given the center/scale/rot/output_size. 
Args: @@ -134,13 +128,13 @@ def get_warp_matrix(theta, size_input, size_dst, size_target): matrix[0, 0] = np.cos(theta) * scale_x matrix[0, 1] = -np.sin(theta) * scale_x matrix[0, 2] = scale_x * ( - -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * - np.sin(theta) + 0.5 * size_target[0]) + -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * np.sin(theta) + 0.5 * size_target[0] + ) matrix[1, 0] = np.sin(theta) * scale_y matrix[1, 1] = np.cos(theta) * scale_y matrix[1, 2] = scale_y * ( - -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * - np.cos(theta) + 0.5 * size_target[1]) + -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * np.cos(theta) + 0.5 * size_target[1] + ) return matrix @@ -212,19 +206,22 @@ def __call__(self, image, im_info): rot, center * 2.0, [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], - scale, ) + scale, + ) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR, ) + flags=cv2.INTER_LINEAR, + ) else: trans = get_affine_transform(center, scale, rot, self.trainsize) image = cv2.warpAffine( image, trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR, ) + flags=cv2.INTER_LINEAR, + ) return image, im_info diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py index e858fa5051eaf..aa9b060ce7059 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/picodet_postprocess.py @@ -41,8 +41,8 @@ def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): rest_boxes = boxes[indexes, :] iou = iou_of( rest_boxes, - np.expand_dims( - current_box, axis=0), ) + np.expand_dims(current_box, axis=0), + ) indexes = indexes[iou <= iou_threshold] return box_scores[picked, :] @@ -88,15 +88,16 @@ class PicoDetPostProcess(object): """ def __init__( - self, - input_shape, - ori_shape, - scale_factor, - strides=[8, 16, 32, 64], - score_threshold=0.4, - nms_threshold=0.5, - nms_top_k=1000, - keep_top_k=100, ): + self, + input_shape, + ori_shape, + scale_factor, + strides=[8, 16, 32, 64], + score_threshold=0.4, + nms_threshold=0.5, + nms_top_k=1000, + keep_top_k=100, + ): self.ori_shape = ori_shape self.input_shape = input_shape self.scale_factor = scale_factor @@ -113,15 +114,13 @@ def warp_boxes(self, boxes, ori_shape): if n: # warp points xy = np.ones((n * 4, 3)) - xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( - n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy[:, :2] = boxes[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 # xy = xy @ M.T # transform xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale # create new boxes x = xy[:, [0, 2, 4, 6]] y = xy[:, [1, 3, 5, 7]] - xy = (np.concatenate( - (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T) + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T # clip boxes xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) @@ -138,8 +137,7 @@ def __call__(self, scores, raw_boxes): # generate centers decode_boxes = [] select_scores = [] - for stride, box_distribute, score in zip(self.strides, raw_boxes, - scores): + for stride, box_distribute, score in zip(self.strides, raw_boxes, scores): box_distribute = box_distribute[batch_id] score = score[batch_id] # centers @@ -162,7 +160,7 @@ def __call__(self, scores, raw_boxes): 
# top K candidate topk_idx = np.argsort(score.max(axis=1))[::-1] - topk_idx = topk_idx[:self.nms_top_k] + topk_idx = topk_idx[: self.nms_top_k] center = center[topk_idx] score = score[topk_idx] box_distance = box_distance[topk_idx] @@ -185,12 +183,12 @@ def __call__(self, scores, raw_boxes): if probs.shape[0] == 0: continue subset_boxes = bboxes[mask, :] - box_probs = np.concatenate( - [subset_boxes, probs.reshape(-1, 1)], axis=1) + box_probs = np.concatenate([subset_boxes, probs.reshape(-1, 1)], axis=1) box_probs = hard_nms( box_probs, iou_threshold=self.nms_threshold, - top_k=self.keep_top_k, ) + top_k=self.keep_top_k, + ) picked_box_probs.append(box_probs) picked_labels.extend([class_index] * box_probs.shape[0]) @@ -202,24 +200,25 @@ def __call__(self, scores, raw_boxes): picked_box_probs = np.concatenate(picked_box_probs) # resize output boxes - picked_box_probs[:, :4] = self.warp_boxes( - picked_box_probs[:, :4], self.ori_shape[batch_id]) - im_scale = np.concatenate([ - self.scale_factor[batch_id][::-1], - self.scale_factor[batch_id][::-1], - ]) + picked_box_probs[:, :4] = self.warp_boxes(picked_box_probs[:, :4], self.ori_shape[batch_id]) + im_scale = np.concatenate( + [ + self.scale_factor[batch_id][::-1], + self.scale_factor[batch_id][::-1], + ] + ) picked_box_probs[:, :4] /= im_scale # clas score box out_boxes_list.append( np.concatenate( [ - np.expand_dims( - np.array(picked_labels), axis=-1), - np.expand_dims( - picked_box_probs[:, 4], axis=-1), + np.expand_dims(np.array(picked_labels), axis=-1), + np.expand_dims(picked_box_probs[:, 4], axis=-1), picked_box_probs[:, :4], ], - axis=1, )) + axis=1, + ) + ) out_boxes_num.append(len(picked_labels)) out_boxes_list = np.concatenate(out_boxes_list, axis=0) diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py index 1066879f2e9ad..e57404bfe6c10 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/preprocess.py @@ -64,16 +64,9 @@ def __call__(self, im, im_info): im_info (dict): info of processed image """ im_scale_y, im_scale_x = self.generate_scale(im) - im = cv2.resize( - im, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) + im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) im_info["im_shape"] = np.array(im.shape[:2]).astype("float32") - im_info["scale_factor"] = np.array( - [im_scale_y, im_scale_x]).astype("float32") + im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32") return im, im_info def generate_scale(self, img): @@ -140,16 +133,9 @@ def __call__(self, im, im_info): assert len(self.target_size) == 2 assert self.target_size[0] > 0 and self.target_size[1] > 0 im_scale_y, im_scale_x = self.generate_scale(im) - im = cv2.resize( - im, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) + im = cv2.resize(im, None, None, fx=im_scale_x, fy=im_scale_y, interpolation=self.interp) im_info["im_shape"] = np.array(im.shape[:2]).astype("float32") - im_info["scale_factor"] = np.array( - [im_scale_y, im_scale_x]).astype("float32") + im_info["scale_factor"] = np.array([im_scale_y, im_scale_x]).astype("float32") return im, im_info def generate_scale(self, im): @@ -189,12 +175,13 @@ class ShortSizeScale(object): """ def __init__( - self, - short_size, - fixed_ratio=True, - keep_ratio=None, - do_round=False, - backend="pillow", ): + self, + 
short_size, + fixed_ratio=True, + keep_ratio=None, + do_round=False, + backend="pillow", + ): self.short_size = short_size assert (fixed_ratio and not keep_ratio) or ( not fixed_ratio @@ -236,10 +223,8 @@ def __call__(self, img): oh = self.short_size else: scale_factor = self.short_size / w - oh = (int(h * float(scale_factor) + 0.5) - if self.do_round else int(h * self.short_size / w)) - ow = (int(w * float(scale_factor) + 0.5) - if self.do_round else int(w * self.short_size / h)) + oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w) + ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h) else: oh = self.short_size if self.fixed_ratio: @@ -248,10 +233,8 @@ def __call__(self, img): ow = self.short_size else: scale_factor = self.short_size / h - oh = (int(h * float(scale_factor) + 0.5) - if self.do_round else int(h * self.short_size / w)) - ow = (int(w * float(scale_factor) + 0.5) - if self.do_round else int(w * self.short_size / h)) + oh = int(h * float(scale_factor) + 0.5) if self.do_round else int(h * self.short_size / w) + ow = int(w * float(scale_factor) + 0.5) if self.do_round else int(w * self.short_size / h) if type(img) == np.ndarray: img = Image.fromarray(img, mode="RGB") @@ -259,12 +242,9 @@ def __call__(self, img): if self.backend == "pillow": result_img = img.resize((ow, oh), Image.BILINEAR) elif self.backend == "cv2" and (self.keep_ratio is not None): - result_img = cv2.resize( - img, (ow, oh), interpolation=cv2.INTER_LINEAR) + result_img = cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR) else: - result_img = Image.fromarray( - cv2.resize( - np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR)) + result_img = Image.fromarray(cv2.resize(np.asarray(img), (ow, oh), interpolation=cv2.INTER_LINEAR)) return result_img @@ -313,7 +293,9 @@ class Permute(object): channel_first (bool): whether convert HWC to CHW """ - def __init__(self, ): + def __init__( + self, + ): super(Permute, self).__init__() def __call__(self, im, im_info): @@ -379,17 +361,15 @@ def letterbox(self, img, height, width, color=(127.5, 127.5, 127.5)): ratio = min(ratio_h, ratio_w) new_shape = ( round(shape[1] * ratio), - round(shape[0] * ratio), ) # [width, height] + round(shape[0] * ratio), + ) # [width, height] padw = (width - new_shape[0]) / 2 padh = (height - new_shape[1]) / 2 top, bottom = round(padh - 0.1), round(padh + 0.1) left, right = round(padw - 0.1), round(padw + 0.1) - img = cv2.resize( - img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border - img = cv2.copyMakeBorder( - img, top, bottom, left, right, cv2.BORDER_CONSTANT, - value=color) # padded rectangular + img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded rectangular return img, ratio, padw, padh def __call__(self, im, im_info): @@ -445,14 +425,15 @@ class WarpAffine(object): """Warp affine the image""" def __init__( - self, - keep_res=False, - pad=31, - input_h=512, - input_w=512, - scale=0.4, - shift=0.1, - down_ratio=4, ): + self, + keep_res=False, + pad=31, + input_h=512, + input_w=512, + scale=0.4, + shift=0.1, + down_ratio=4, + ): self.keep_res = keep_res self.pad = pad self.input_h = input_h @@ -489,32 +470,32 @@ def __call__(self, im, im_info): trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) img = cv2.resize(img, (w, h)) - inp = cv2.warpAffine( - img, trans_input, (input_w, input_h), 
flags=cv2.INTER_LINEAR) + inp = cv2.warpAffine(img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) if not self.keep_res: out_h = input_h // self.down_ratio out_w = input_w // self.down_ratio trans_output = get_affine_transform(c, s, 0, [out_w, out_h]) - im_info.update({ - "center": c, - "scale": s, - "out_height": out_h, - "out_width": out_w, - "inp_height": input_h, - "inp_width": input_w, - "trans_input": trans_input, - "trans_output": trans_output, - }) + im_info.update( + { + "center": c, + "scale": s, + "out_height": out_h, + "out_width": out_w, + "inp_height": input_h, + "inp_width": input_w, + "trans_input": trans_input, + "trans_output": trans_output, + } + ) return inp, im_info def preprocess(im, preprocess_ops): # process image by preprocess_ops im_info = { - "scale_factor": np.array( - [1.0, 1.0], dtype=np.float32), + "scale_factor": np.array([1.0, 1.0], dtype=np.float32), "im_shape": None, } im, im_info = decode_image(im, im_info) diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py index 179b3b366e15a..1d38777a4526c 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/util.py @@ -46,8 +46,7 @@ def pad_right_down_corner(img, stride, padValue): def transfer(model, model_weights): transfered_model_weights = {} for weights_name in model.state_dict().keys(): - transfered_model_weights[weights_name] = model_weights[".".join( - weights_name.split(".")[1:])] + transfered_model_weights[weights_name] = model_weights[".".join(weights_name.split(".")[1:])] return transfered_model_weights @@ -113,11 +112,9 @@ def draw_bodypose(canvas, candidate, subset): X = candidate[index.astype(int), 1] mX = np.mean(X) mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) cv2.fillConvexPoly(cur_canvas, polygon, colors[i]) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) return canvas @@ -156,9 +153,9 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): canvas, (x1, y1), (x2, y2), - matplotlib.colors.hsv_to_rgb( - [ie / float(len(edges)), 1.0, 1.0]) * 255, - thickness=2, ) + matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, + thickness=2, + ) for i, keyponit in enumerate(peaks): x, y = keyponit @@ -171,7 +168,8 @@ def draw_handpose(canvas, all_hand_peaks, show_number=False): cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), - lineType=cv2.LINE_AA, ) + lineType=cv2.LINE_AA, + ) return canvas @@ -192,16 +190,14 @@ def hand_detect(candidate, subset, oriImg): hands = [] # left hand if has_left: - left_shoulder_index, left_elbow_index, left_wrist_index = person[ - [5, 6, 7]] + left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]] x1, y1 = candidate[left_shoulder_index][:2] x2, y2 = candidate[left_elbow_index][:2] x3, y3 = candidate[left_wrist_index][:2] hands.append([x1, y1, x2, y2, x3, y3, True]) # right hand if has_right: - right_shoulder_index, right_elbow_index, right_wrist_index = person[ - [2, 3, 4]] + right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]] x1, y1 = candidate[right_shoulder_index][:2] x2, y2 = 
candidate[right_elbow_index][:2] x3, y3 = candidate[right_wrist_index][:2] @@ -216,8 +212,8 @@ def hand_detect(candidate, subset, oriImg): # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder); x = x3 + ratioWristElbow * (x3 - x2) y = y3 + ratioWristElbow * (y3 - y2) - distanceWristElbow = math.sqrt((x3 - x2)**2 + (y3 - y2)**2) - distanceElbowShoulder = math.sqrt((x2 - x1)**2 + (y2 - y1)**2) + distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2) + distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) width = 1.0 * max(distanceWristElbow, 0.9 * distanceElbowShoulder) # x-y refers to the center --> offset to topLeft point # handRectangle.x -= handRectangle.width / 2.f; diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py index eba62c30d1e34..eb3856ca3a117 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/utils.py @@ -26,41 +26,40 @@ def argsparser(): "--model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), - required=True, ) - parser.add_argument( - "--image_file", type=str, default=None, help="Path of image file.") + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." + ), + required=True, + ) + parser.add_argument("--image_file", type=str, default=None, help="Path of image file.") parser.add_argument( "--image_dir", type=str, default=None, - help="Dir of image file, `image_file` has a higher priority.", ) - parser.add_argument( - "--batch_size", type=int, default=1, help="batch_size for inference.") + help="Dir of image file, `image_file` has a higher priority.", + ) + parser.add_argument("--batch_size", type=int, default=1, help="batch_size for inference.") parser.add_argument( "--video_file", type=str, default=None, help="Path of video file, `video_file` or `camera_id` has a highest priority.", ) - parser.add_argument( - "--camera_id", - type=int, - default=-1, - help="device id of camera to predict.") - parser.add_argument( - "--threshold", type=float, default=0.5, help="Threshold of score.") + parser.add_argument("--camera_id", type=int, default=-1, help="device id of camera to predict.") + parser.add_argument("--threshold", type=float, default=0.5, help="Threshold of score.") parser.add_argument( "--output_dir", type=str, default="output", - help="Directory of output visualization files.", ) + help="Directory of output visualization files.", + ) parser.add_argument( "--run_mode", type=str, default="paddle", - help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", ) + help="mode of running(paddle/trt_fp32/trt_fp16/trt_int8)", + ) parser.add_argument( "--device", type=str, @@ -71,74 +70,70 @@ def argsparser(): "--use_gpu", type=ast.literal_eval, default=False, - help="Deprecated, please use `--device`.", ) + help="Deprecated, please use `--device`.", + ) parser.add_argument( "--run_benchmark", type=ast.literal_eval, default=False, - help="Whether to predict a image_file repeatedly for benchmark", ) + help="Whether to predict a image_file repeatedly for benchmark", + ) parser.add_argument( "--enable_mkldnn", type=ast.literal_eval, default=False, - help="Whether use mkldnn with CPU.", ) + help="Whether use mkldnn with CPU.", + ) parser.add_argument( "--enable_mkldnn_bfloat16", 
type=ast.literal_eval, default=False, - help="Whether use mkldnn bfloat16 inference with CPU.", ) - parser.add_argument( - "--cpu_threads", type=int, default=1, help="Num of threads with CPU.") - parser.add_argument( - "--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") - parser.add_argument( - "--trt_max_shape", - type=int, - default=1280, - help="max_shape for TensorRT.") - parser.add_argument( - "--trt_opt_shape", - type=int, - default=640, - help="opt_shape for TensorRT.") + help="Whether use mkldnn bfloat16 inference with CPU.", + ) + parser.add_argument("--cpu_threads", type=int, default=1, help="Num of threads with CPU.") + parser.add_argument("--trt_min_shape", type=int, default=1, help="min_shape for TensorRT.") + parser.add_argument("--trt_max_shape", type=int, default=1280, help="max_shape for TensorRT.") + parser.add_argument("--trt_opt_shape", type=int, default=640, help="opt_shape for TensorRT.") parser.add_argument( "--trt_calib_mode", type=bool, default=False, - help="If the model is produced by TRT offline quantitative " - "calibration, trt_calib_mode need to set True.", ) + help="If the model is produced by TRT offline quantitative " "calibration, trt_calib_mode need to set True.", + ) parser.add_argument( "--save_images", type=ast.literal_eval, default=True, - help="Save visualization image results.", ) - parser.add_argument( - "--save_mot_txts", - action="store_true", - help="Save tracking results (txt).") + help="Save visualization image results.", + ) + parser.add_argument("--save_mot_txts", action="store_true", help="Save tracking results (txt).") parser.add_argument( "--save_mot_txt_per_img", action="store_true", - help="Save tracking results (txt) for each image.", ) + help="Save tracking results (txt) for each image.", + ) parser.add_argument( "--scaled", type=bool, default=False, - help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " - "True in general detector.", ) - parser.add_argument( - "--tracker_config", type=str, default=None, help=("tracker donfig")) + help="Whether coords after detector outputs are scaled, False in JDE YOLOv3 " "True in general detector.", + ) + parser.add_argument("--tracker_config", type=str, default=None, help=("tracker donfig")) parser.add_argument( "--reid_model_dir", type=str, default=None, - help=("Directory include:'model.pdiparams', 'model.pdmodel', " - "'infer_cfg.yml', created by tools/export_model.py."), ) + help=( + "Directory include:'model.pdiparams', 'model.pdmodel', " + "'infer_cfg.yml', created by tools/export_model.py." 
+ ), + ) parser.add_argument( "--reid_batch_size", type=int, default=50, - help="max batch_size for reid model inference.", ) + help="max batch_size for reid model inference.", + ) parser.add_argument( "--use_dark", type=ast.literal_eval, @@ -149,27 +144,32 @@ def argsparser(): "--action_file", type=str, default=None, - help="Path of input file for action recognition.", ) + help="Path of input file for action recognition.", + ) parser.add_argument( "--window_size", type=int, default=50, - help="Temporal size of skeleton feature for action recognition.", ) + help="Temporal size of skeleton feature for action recognition.", + ) parser.add_argument( "--random_pad", type=ast.literal_eval, default=False, - help="Whether do random padding for action recognition.", ) + help="Whether do random padding for action recognition.", + ) parser.add_argument( "--save_results", action="store_true", default=False, - help="Whether save detection result to file using coco format", ) + help="Whether save detection result to file using coco format", + ) parser.add_argument( "--use_coco_category", action="store_true", default=False, - help="Whether to use the coco format dictionary `clsid2catid`", ) + help="Whether to use the coco format dictionary `clsid2catid`", + ) parser.add_argument( "--slice_infer", action="store_true", @@ -180,13 +180,15 @@ def argsparser(): nargs="+", type=int, default=[640, 640], - help="Height of the sliced image.", ) + help="Height of the sliced image.", + ) parser.add_argument( "--overlap_ratio", nargs="+", type=float, default=[0.25, 0.25], - help="Overlap height ratio of the sliced image.", ) + help="Overlap height ratio of the sliced image.", + ) parser.add_argument( "--combine_method", type=str, @@ -197,12 +199,14 @@ def argsparser(): "--match_threshold", type=float, default=0.6, - help="Combine method matching threshold.", ) + help="Combine method matching threshold.", + ) parser.add_argument( "--match_metric", type=str, default="ios", - help="Combine method matching metric, choose in ['iou', 'ios'].", ) + help="Combine method matching metric, choose in ['iou', 'ios'].", + ) return parser @@ -254,38 +258,34 @@ def info(self, average=False): total_time = total_time + track_time total_time = round(total_time, 4) print("------------------ Inference Time Info ----------------------") - print("total_time(ms): {}, img_num: {}".format(total_time * 1000, - self.img_num)) - preprocess_time = (round(pre_time / max(1, self.img_num), 4) - if average else pre_time) - postprocess_time = (round(post_time / max(1, self.img_num), 4) - if average else post_time) - inference_time = (round(infer_time / max(1, self.img_num), 4) - if average else infer_time) - tracking_time = (round(track_time / max(1, self.img_num), 4) - if average else track_time) + print("total_time(ms): {}, img_num: {}".format(total_time * 1000, self.img_num)) + preprocess_time = round(pre_time / max(1, self.img_num), 4) if average else pre_time + postprocess_time = round(post_time / max(1, self.img_num), 4) if average else post_time + inference_time = round(infer_time / max(1, self.img_num), 4) if average else infer_time + tracking_time = round(track_time / max(1, self.img_num), 4) if average else track_time average_latency = total_time / max(1, self.img_num) qps = 0 if total_time > 0: qps = 1 / average_latency - print("average latency time(ms): {:.2f}, QPS: {:2f}".format( - average_latency * 1000, qps)) + print("average latency time(ms): {:.2f}, QPS: {:2f}".format(average_latency * 1000, qps)) if self.with_tracker: print( - 
"preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}". - format( + "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}, tracking_time(ms): {:.2f}".format( preprocess_time * 1000, inference_time * 1000, postprocess_time * 1000, - tracking_time * 1000, )) + tracking_time * 1000, + ) + ) else: print( - "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}". - format( + "preprocess_time(ms): {:.2f}, inference_time(ms): {:.2f}, postprocess_time(ms): {:.2f}".format( preprocess_time * 1000, inference_time * 1000, - postprocess_time * 1000, )) + postprocess_time * 1000, + ) + ) def report(self, average=False): dic = {} @@ -294,18 +294,13 @@ def report(self, average=False): post_time = self.postprocess_time_s.value() track_time = self.tracking_time_s.value() - dic["preprocess_time_s"] = (round(pre_time / max(1, self.img_num), 4) - if average else pre_time) - dic["inference_time_s"] = (round(infer_time / max(1, self.img_num), 4) - if average else infer_time) - dic["postprocess_time_s"] = (round(post_time / max(1, self.img_num), 4) - if average else post_time) + dic["preprocess_time_s"] = round(pre_time / max(1, self.img_num), 4) if average else pre_time + dic["inference_time_s"] = round(infer_time / max(1, self.img_num), 4) if average else infer_time + dic["postprocess_time_s"] = round(post_time / max(1, self.img_num), 4) if average else post_time dic["img_num"] = self.img_num total_time = pre_time + infer_time + post_time if self.with_tracker: - dic["tracking_time_s"] = ( - round(track_time / max(1, self.img_num), 4) - if average else track_time) + dic["tracking_time_s"] = round(track_time / max(1, self.img_num), 4) if average else track_time total_time = total_time + track_time dic["total_time_s"] = round(total_time, 4) return dic @@ -513,10 +508,9 @@ def gaussian_radius(bbox_size, min_overlap): def gaussian2D(shape, sigma_x=1, sigma_y=1): m, n = [(ss - 1.0) / 2.0 for ss in shape] - y, x = np.ogrid[-m:m + 1, -n:n + 1] + y, x = np.ogrid[-m : m + 1, -n : n + 1] - h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * - sigma_y))) + h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * sigma_y))) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h @@ -526,8 +520,7 @@ def draw_umich_gaussian(heatmap, center, radius, k=1): draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 """ diameter = 2 * radius + 1 - gaussian = gaussian2D( - (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) + gaussian = gaussian2D((diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) x, y = int(center[0]), int(center[1]) @@ -536,9 +529,8 @@ def draw_umich_gaussian(heatmap, center, radius, k=1): left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] + masked_heatmap = heatmap[y - top : y + bottom, x - left : x + right] + masked_gaussian = gaussian[radius - top : radius + bottom, radius - left : radius + right] if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap diff --git a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py 
b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py index 3fdd640c1969b..6ea9f1b4a241b 100644 --- a/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py +++ b/ppdiffusers/examples/controlnet/annotator/ppdet_hrnet/visualize.py @@ -42,8 +42,7 @@ def visualize_box_mask(im, results, labels, threshold=0.5): elif isinstance(im, np.ndarray): im = Image.fromarray(im) if "masks" in results and "boxes" in results and len(results["boxes"]) > 0: - im = draw_mask( - im, results["boxes"], results["masks"], labels, threshold=threshold) + im = draw_mask(im, results["boxes"], results["masks"], labels, threshold=threshold) if "boxes" in results and len(results["boxes"]) > 0: im = draw_box(im, results["boxes"], labels, threshold=threshold) if "segm" in results: @@ -53,7 +52,8 @@ def visualize_box_mask(im, results, labels, threshold=0.5): results["label"], results["score"], labels, - threshold=threshold, ) + threshold=threshold, + ) return im @@ -74,7 +74,7 @@ def get_color_map_list(num_classes): color_map[i * 3 + 2] |= ((lab >> 2) & 1) << (7 - j) j += 1 lab >>= 3 - color_map = [color_map[i:i + 3] for i in range(0, len(color_map), 3)] + color_map = [color_map[i : i + 3] for i in range(0, len(color_map), 3)] return color_map @@ -141,40 +141,31 @@ def draw_box(im, np_boxes, labels, threshold=0.5): if len(bbox) == 4: xmin, ymin, xmax, ymax = bbox - print("class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}]," - "right_bottom:[{:.2f},{:.2f}]".format( - int(clsid), score, xmin, ymin, xmax, ymax)) + print( + "class_id:{:d}, confidence:{:.4f}, left_top:[{:.2f},{:.2f}]," + "right_bottom:[{:.2f},{:.2f}]".format(int(clsid), score, xmin, ymin, xmax, ymax) + ) # draw bbox draw.line( - [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), - (xmin, ymin)], + [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin)], width=draw_thickness, - fill=color, ) + fill=color, + ) elif len(bbox) == 8: x1, y1, x2, y2, x3, y3, x4, y4 = bbox - draw.line( - [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], - width=2, - fill=color) + draw.line([(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], width=2, fill=color) xmin = min(x1, x2, x3, x4) ymin = min(y1, y2, y3, y4) # draw label text = "{} {:.4f}".format(labels[clsid], score) tw, th = draw.textsize(text) - draw.rectangle( - [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) + draw.rectangle([(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) return im -def draw_segm(im, - np_segms, - np_label, - np_score, - labels, - threshold=0.5, - alpha=0.7): +def draw_segm(im, np_segms, np_label, np_score, labels, threshold=0.5, alpha=0.7): """ Draw segmentation on image """ @@ -204,8 +195,7 @@ def draw_segm(im, sum_y = np.sum(mask, axis=1) y = np.where(sum_y > 0.5)[0] x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1] - cv2.rectangle(im, (x0, y0), (x1, y1), - tuple(color_mask.astype("int32").tolist()), 1) + cv2.rectangle(im, (x0, y0), (x1, y1), tuple(color_mask.astype("int32").tolist()), 1) bbox_text = "%s %.2f" % (labels[clsid], score) t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0] cv2.rectangle( @@ -213,7 +203,8 @@ def draw_segm(im, (x0, y0), (x0 + t_size[0], y0 - t_size[1] - 3), tuple(color_mask.astype("int32").tolist()), - -1, ) + -1, + ) cv2.putText( im, bbox_text, @@ -222,7 +213,8 @@ def draw_segm(im, 0.3, (0, 0, 0), 1, - lineType=cv2.LINE_AA, ) + lineType=cv2.LINE_AA, + ) return Image.fromarray(im.astype("uint8")) @@ -233,20 +225,20 @@ def get_color(idx): 
def visualize_pose( - imgfile, - results, - visual_thresh=0.6, - save_name="pose.jpg", - save_dir="output", - returnimg=False, - ids=None, ): + imgfile, + results, + visual_thresh=0.6, + save_name="pose.jpg", + save_dir="output", + returnimg=False, + ids=None, +): try: import matplotlib.pyplot as plt plt.switch_backend("agg") except Exception as e: - print("Matplotlib not found, please install matplotlib." - "for example: `pip install matplotlib`.") + print("Matplotlib not found, please install matplotlib." "for example: `pip install matplotlib`.") raise e skeletons, _ = results["keypoint"] skeletons = np.array(skeletons) @@ -323,8 +315,7 @@ def visualize_pose( bboxs = results["bbox"] for j, rect in enumerate(bboxs): xmin, ymin, xmax, ymax = rect - color = (colors[0] if color_set is None else - colors[color_set[j] % len(colors)]) + color = colors[0] if color_set is None else colors[color_set[j] % len(colors)] cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) canvas = img.copy() @@ -333,8 +324,7 @@ def visualize_pose( if skeletons[j][i, 2] < visual_thresh: continue if ids is None: - color = (colors[i] if color_set is None else - colors[color_set[j] % len(colors)]) + color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) @@ -343,15 +333,15 @@ def visualize_pose( tuple(skeletons[j][i, 0:2].astype("int32")), 2, color, - thickness=-1, ) + thickness=-1, + ) stickwidth = 2 for i in range(NUM_EDGES): for j in range(len(skeletons)): edge = EDGES[i] - if (skeletons[j][edge[0], 2] < visual_thresh or - skeletons[j][edge[1], 2] < visual_thresh): + if skeletons[j][edge[0], 2] < visual_thresh or skeletons[j][edge[1], 2] < visual_thresh: continue cur_canvas = canvas.copy() @@ -359,22 +349,18 @@ def visualize_pose( Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] mX = np.mean(X) mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 + length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5 angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) + polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1) if ids is None: - color = (colors[i] if color_set is None else - colors[color_set[j] % len(colors)]) + color = colors[i] if color_set is None else colors[color_set[j] % len(colors)] else: color = get_color(ids[j]) cv2.fillConvexPoly(cur_canvas, polygon, color) canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) if returnimg: return canvas - save_name = os.path.join( - save_dir, os.path.splitext(os.path.basename(imgfile))[0] + "_vis.jpg") + save_name = os.path.join(save_dir, os.path.splitext(os.path.basename(imgfile))[0] + "_vis.jpg") plt.imsave(save_name, canvas[:, :, ::-1]) print("keypoint visualize image saved to: " + save_name) plt.close() @@ -414,5 +400,6 @@ def visualize_attr(im, results, boxes=None, is_mtmct=False): cv2.FONT_ITALIC, text_scale, (0, 255, 255), - thickness=text_thickness, ) + thickness=text_thickness, + ) return im diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py index 53102b4c87bb4..1284578b851f1 100644 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/__init__.py @@ -32,71 +32,80 @@ def parse_args(): parser = 
argparse.ArgumentParser(description="Model prediction") # params of prediction - parser.add_argument( - "--config", dest="cfg", help="The config file.", default=None, type=str) + parser.add_argument("--config", dest="cfg", help="The config file.", default=None, type=str) parser.add_argument( "--model_path", dest="model_path", help="The path of model for prediction", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--image_path", dest="image_path", help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--save_dir", dest="save_dir", help="The directory for saving the predicted results", type=str, - default="./output/result", ) + default="./output/result", + ) # augment for prediction parser.add_argument( "--aug_pred", dest="aug_pred", help="Whether to use mulit-scales and flip augment for prediction", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--scales", dest="scales", nargs="+", help="Scales for augment", type=float, - default=1.0, ) + default=1.0, + ) parser.add_argument( "--flip_horizontal", dest="flip_horizontal", help="Whether to use flip horizontally augment", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--flip_vertical", dest="flip_vertical", help="Whether to use flip vertically augment", - action="store_true", ) + action="store_true", + ) # sliding window prediction parser.add_argument( "--is_slide", dest="is_slide", help="Whether to prediction by sliding window", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--crop_size", dest="crop_size", nargs=2, help="The crop size of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) parser.add_argument( "--stride", dest="stride", nargs=2, help="The stride of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) # custom color map parser.add_argument( @@ -105,7 +114,8 @@ def parse_args(): nargs="+", help="Save images with a custom color map. Default: None, use paddleseg's default color map.", type=int, - default=None, ) + default=None, + ) # set device parser.add_argument( @@ -113,7 +123,8 @@ def parse_args(): dest="device", help="Device place to be set, which can be GPU, XPU, NPU, CPU", default="gpu", - type=str, ) + type=str, + ) return parser.parse_args() @@ -301,8 +312,7 @@ def get_test_config(cfg, args): def main(args): env_info = get_sys_env() - if (args.device == "gpu" and env_info["Paddle compiled with cuda"] and - env_info["GPUs used"]): + if args.device == "gpu" and env_info["Paddle compiled with cuda"] and env_info["GPUs used"]: place = "gpu" elif args.device == "xpu" and paddle.is_compiled_with_xpu(): place = "xpu" @@ -337,10 +347,13 @@ def main(args): image_list=image_list, image_dir=image_dir, save_dir=args.save_dir, - **test_config, ) + **test_config, + ) -checkpoint_file = "https://bj.bcebos.com/paddleseg/dygraph/cityscapes/segformer_b5_cityscapes_1024x1024_160k/model.pdparams" +checkpoint_file = ( + "https://bj.bcebos.com/paddleseg/dygraph/cityscapes/segformer_b5_cityscapes_1024x1024_160k/model.pdparams" +) class SegformerDetector: @@ -350,27 +363,21 @@ def __init__(self, mode): "ade20k", ], f"mode should in {['cityscapes', 'ade20k']}!" 
if mode == "cityscapes": - segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, - "segformer_model") - modelpath = os.path.join(segformer_annotator_ckpts_path, - "model.pdparams") + segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model") + modelpath = os.path.join(segformer_annotator_ckpts_path, "model.pdparams") if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import \ - get_path_from_url_with_filelock + from paddlenlp.utils.downloader import get_path_from_url_with_filelock - get_path_from_url_with_filelock( - checkpoint_file, root_dir=segformer_annotator_ckpts_path) + get_path_from_url_with_filelock(checkpoint_file, root_dir=segformer_annotator_ckpts_path) self.model_path = modelpath - cfg = ( - "annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml" - ) + cfg = "annotator/segformer_paddle/segformer_b5_cityscapes_1024x1024_160k.yml" else: - segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, - "segformer_model") + segformer_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segformer_model") modelpath = os.path.join( segformer_annotator_ckpts_path, - "segformer_b5_ade20k_512x512_160k.pdparams", ) + "segformer_b5_ade20k_512x512_160k.pdparams", + ) self.model_path = modelpath @@ -404,9 +411,9 @@ def __call__(self, img): save_dir="output", skip_save=True, custom_color=custom_color_flatten, - **self.test_config, ) - pred_mask = cv2.cvtColor( - np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) + **self.test_config, + ) + pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) return pred_mask diff --git a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py index 6077f36175759..5e1850259a3f1 100644 --- a/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py +++ b/ppdiffusers/examples/controlnet/annotator/segformer_paddle/predict.py @@ -33,7 +33,7 @@ def mkdir(path): def partition_list(arr, m): """split the list 'arr' into m pieces""" n = int(math.ceil(len(arr) / float(m))) - return [arr[i:i + n] for i in range(0, len(arr), n)] + return [arr[i : i + n] for i in range(0, len(arr), n)] def preprocess(im_path, transforms): @@ -47,20 +47,21 @@ def preprocess(im_path, transforms): def predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, +): """ predict and visualize the image_list. 
@@ -112,7 +113,8 @@ def predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -120,7 +122,8 @@ def predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -133,16 +136,14 @@ def predict( im_file = im_file[1:] # save added image - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) @@ -151,21 +152,22 @@ def predict( def quick_predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, - skip_save=True, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, + skip_save=True, +): """ predict and visualize the image_list. @@ -218,7 +220,8 @@ def quick_predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -226,7 +229,8 @@ def quick_predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -241,8 +245,7 @@ def quick_predict( # save added image if not skip_save: - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) @@ -250,8 +253,7 @@ def quick_predict( # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) if not skip_save: - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py index 76919bda8b88c..5d041d259a4ad 100644 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py +++ b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/__init__.py @@ -32,71 +32,80 @@ def parse_args(): parser = argparse.ArgumentParser(description="Model prediction") # params of prediction - parser.add_argument( - "--config", dest="cfg", help="The config file.", default=None, type=str) + parser.add_argument("--config", dest="cfg", help="The config file.", default=None, 
type=str) parser.add_argument( "--model_path", dest="model_path", help="The path of model for prediction", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--image_path", dest="image_path", help="The image to predict, which can be a path of image, or a file list containing image paths, or a directory including images", type=str, - default=None, ) + default=None, + ) parser.add_argument( "--save_dir", dest="save_dir", help="The directory for saving the predicted results", type=str, - default="./output/result", ) + default="./output/result", + ) # augment for prediction parser.add_argument( "--aug_pred", dest="aug_pred", help="Whether to use mulit-scales and flip augment for prediction", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--scales", dest="scales", nargs="+", help="Scales for augment", type=float, - default=1.0, ) + default=1.0, + ) parser.add_argument( "--flip_horizontal", dest="flip_horizontal", help="Whether to use flip horizontally augment", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--flip_vertical", dest="flip_vertical", help="Whether to use flip vertically augment", - action="store_true", ) + action="store_true", + ) # sliding window prediction parser.add_argument( "--is_slide", dest="is_slide", help="Whether to prediction by sliding window", - action="store_true", ) + action="store_true", + ) parser.add_argument( "--crop_size", dest="crop_size", nargs=2, help="The crop size of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) parser.add_argument( "--stride", dest="stride", nargs=2, help="The stride of sliding window, the first is width and the second is height.", type=int, - default=None, ) + default=None, + ) # custom color map parser.add_argument( @@ -105,7 +114,8 @@ def parse_args(): nargs="+", help="Save images with a custom color map. 
Default: None, use paddleseg's default color map.", type=int, - default=None, ) + default=None, + ) # set device parser.add_argument( @@ -113,7 +123,8 @@ def parse_args(): dest="device", help="Device place to be set, which can be GPU, XPU, NPU, CPU", default="gpu", - type=str, ) + type=str, + ) return parser.parse_args() @@ -301,8 +312,7 @@ def get_test_config(cfg, args): def main(args): env_info = get_sys_env() - if (args.device == "gpu" and env_info["Paddle compiled with cuda"] and - env_info["GPUs used"]): + if args.device == "gpu" and env_info["Paddle compiled with cuda"] and env_info["GPUs used"]: place = "gpu" elif args.device == "xpu" and paddle.is_compiled_with_xpu(): place = "xpu" @@ -337,24 +347,23 @@ def main(args): image_list=image_list, image_dir=image_dir, save_dir=args.save_dir, - **test_config, ) + **test_config, + ) -checkpoint_file = "https://paddleseg.bj.bcebos.com/dygraph/ade20k/segmenter_vit_base_linear_ade20k_512x512_160k/model.pdparams" +checkpoint_file = ( + "https://paddleseg.bj.bcebos.com/dygraph/ade20k/segmenter_vit_base_linear_ade20k_512x512_160k/model.pdparams" +) class SegmenterDetector: def __init__(self): - segmenter_annotator_ckpts_path = os.path.join(annotator_ckpts_path, - "segmenter_model") - modelpath = os.path.join(segmenter_annotator_ckpts_path, - "model.pdparams") + segmenter_annotator_ckpts_path = os.path.join(annotator_ckpts_path, "segmenter_model") + modelpath = os.path.join(segmenter_annotator_ckpts_path, "model.pdparams") if not os.path.exists(modelpath): - from paddlenlp.utils.downloader import \ - get_path_from_url_with_filelock + from paddlenlp.utils.downloader import get_path_from_url_with_filelock - get_path_from_url_with_filelock( - checkpoint_file, root_dir=segmenter_annotator_ckpts_path) + get_path_from_url_with_filelock(checkpoint_file, root_dir=segmenter_annotator_ckpts_path) self.model_path = modelpath cfg = "annotator/segmenter_paddle/segmenter_vit_base_linear_ade20k_512x512_160k.yml" @@ -385,9 +394,9 @@ def __call__(self, img): save_dir="output", skip_save=True, custom_color=custom_color_flatten, - **self.test_config, ) - pred_mask = cv2.cvtColor( - np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) + **self.test_config, + ) + pred_mask = cv2.cvtColor(np.asarray(pred_mask.convert("RGB"))[:, :, ::-1], cv2.COLOR_RGB2BGR) return pred_mask diff --git a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py index 6077f36175759..5e1850259a3f1 100644 --- a/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py +++ b/ppdiffusers/examples/controlnet/annotator/segmenter_paddle/predict.py @@ -33,7 +33,7 @@ def mkdir(path): def partition_list(arr, m): """split the list 'arr' into m pieces""" n = int(math.ceil(len(arr) / float(m))) - return [arr[i:i + n] for i in range(0, len(arr), n)] + return [arr[i : i + n] for i in range(0, len(arr), n)] def preprocess(im_path, transforms): @@ -47,20 +47,21 @@ def preprocess(im_path, transforms): def predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, +): 
""" predict and visualize the image_list. @@ -112,7 +113,8 @@ def predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -120,7 +122,8 @@ def predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -133,16 +136,14 @@ def predict( im_file = im_file[1:] # save added image - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) @@ -151,21 +152,22 @@ def predict( def quick_predict( - model, - model_path, - transforms, - image_list, - image_dir=None, - save_dir="output", - aug_pred=False, - scales=1.0, - flip_horizontal=True, - flip_vertical=False, - is_slide=False, - stride=None, - crop_size=None, - custom_color=None, - skip_save=True, ): + model, + model_path, + transforms, + image_list, + image_dir=None, + save_dir="output", + aug_pred=False, + scales=1.0, + flip_horizontal=True, + flip_vertical=False, + is_slide=False, + stride=None, + crop_size=None, + custom_color=None, + skip_save=True, +): """ predict and visualize the image_list. @@ -218,7 +220,8 @@ def quick_predict( flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) else: pred, _ = infer.inference( model, @@ -226,7 +229,8 @@ def quick_predict( trans_info=data["trans_info"], is_slide=is_slide, stride=stride, - crop_size=crop_size, ) + crop_size=crop_size, + ) pred = paddle.squeeze(pred) pred = pred.numpy().astype("uint8") @@ -241,8 +245,7 @@ def quick_predict( # save added image if not skip_save: - added_image = utils.visualize.visualize( - im_path, pred, color_map, weight=0.6) + added_image = utils.visualize.visualize(im_path, pred, color_map, weight=0.6) added_image_path = os.path.join(added_saved_dir, im_file) mkdir(added_image_path) cv2.imwrite(added_image_path, added_image) @@ -250,8 +253,7 @@ def quick_predict( # save pseudo color prediction pred_mask = utils.visualize.get_pseudo_color_map(pred, color_map) if not skip_save: - pred_saved_path = os.path.join( - pred_saved_dir, os.path.splitext(im_file)[0] + ".png") + pred_saved_path = os.path.join(pred_saved_dir, os.path.splitext(im_file)[0] + ".png") mkdir(pred_saved_path) pred_mask.save(pred_saved_path) diff --git a/ppdiffusers/examples/controlnet/annotator/util.py b/ppdiffusers/examples/controlnet/annotator/util.py index 069005f683d59..7231c67ac5507 100644 --- a/ppdiffusers/examples/controlnet/annotator/util.py +++ b/ppdiffusers/examples/controlnet/annotator/util.py @@ -53,16 +53,15 @@ def resize_image(input_image, resolution): img = cv2.resize( input_image, (W, H), - interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, ) + interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, + ) return img def make_noise_disk(H, W, C, F): - noise = np.random.uniform( - low=0, high=1, size=((H // F) + 2, (W // F) + 
2, C)) - noise = cv2.resize( - noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC) - noise = noise[F:F + H, F:F + W] + noise = np.random.uniform(low=0, high=1, size=((H // F) + 2, (W // F) + 2, C)) + noise = cv2.resize(noise, (W + 2 * F, H + 2 * F), interpolation=cv2.INTER_CUBIC) + noise = noise[F : F + H, F : F + W] noise -= np.min(noise) noise /= np.max(noise) if C == 1: diff --git a/ppdiffusers/examples/controlnet/control/control_args.py b/ppdiffusers/examples/controlnet/control/control_args.py index 82e5c32ab1181..6a688687e1a27 100644 --- a/ppdiffusers/examples/controlnet/control/control_args.py +++ b/ppdiffusers/examples/controlnet/control/control_args.py @@ -22,44 +22,28 @@ class ModelArguments: Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. """ - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "pretrained_vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_encoder_name_or_path"}) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }, ) - model_max_length: Optional[int] = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field( - default=50, metadata={"help": "num_inference_steps"}) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, + ) + model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + num_inference_steps: Optional[int] = field(default=50, metadata={"help": "num_inference_steps"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) pretrained_model_name_or_path: str = field( default="runwayml/stable-diffusion-v1-5", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." 
- }, ) - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) - sd_locked: bool = field( - default=True, metadata={"help": "lock unet output_blocks and out."}) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init."}) - only_mid_control: bool = field( - default=False, metadata={"help": "only_mid_control."}) - is_ldmbert: bool = field( - default=False, metadata={"help": "Whether to use ldmbert."}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) + sd_locked: bool = field(default=True, metadata={"help": "lock unet output_blocks and out."}) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."}) + only_mid_control: bool = field(default=False, metadata={"help": "only_mid_control."}) + is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) @dataclass @@ -71,8 +55,7 @@ class DataArguments: resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) - file_path: str = field( - default="./fill50k", metadata={"help": "The path to of the fill50k."}) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) + file_path: str = field(default="./fill50k", metadata={"help": "The path to of the fill50k."}) diff --git a/ppdiffusers/examples/controlnet/control/control_trainer.py b/ppdiffusers/examples/controlnet/control/control_trainer.py index 506dfc88664cb..0b40903ded378 100644 --- a/ppdiffusers/examples/controlnet/control/control_trainer.py +++ b/ppdiffusers/examples/controlnet/control/control_trainer.py @@ -18,8 +18,11 @@ import paddle.amp.auto_cast as autocast from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK, - VisualDLCallback, rewrite_logs) +from paddlenlp.trainer.integrations import ( + INTEGRATION_TO_CALLBACK, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.utils.log import logger from ppdiffusers.training_utils import unwrap_model @@ -36,19 +39,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -58,20 +59,22 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"]) - image_logs["control"] = model.decode_control_image( - controlnet_cond=inputs["controlnet_cond"]) + image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) + image_logs["control"] = model.decode_control_image(controlnet_cond=inputs["controlnet_cond"]) image_logs["ddim-samples-9.0"] = model.log_image( input_ids=inputs["input_ids"], controlnet_cond=inputs["controlnet_cond"], guidance_scale=9.0, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) if self.vdl_writer is None: self._init_summary_writer(args) @@ -86,11 +89,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." 
+ ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -103,14 +106,11 @@ def compute_loss(self, model, inputs, return_outputs=False): loss = model(**inputs) return loss - def _save(self, - output_dir=None, - state_dict=None, - merge_tensor_parallel=False): + def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): super()._save( output_dir=output_dir, state_dict=state_dict, - merge_tensor_parallel=merge_tensor_parallel, ) + merge_tensor_parallel=merge_tensor_parallel, + ) output_dir = output_dir if output_dir is not None else self.args.output_dir - unwrap_model(self.model).controlnet.save_pretrained( - os.path.join(output_dir, "controlnet")) + unwrap_model(self.model).controlnet.save_pretrained(os.path.join(output_dir, "controlnet")) diff --git a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py b/ppdiffusers/examples/controlnet/control/dumpy_dataset.py index 78c3c2bfdbf84..c67eca10fb034 100644 --- a/ppdiffusers/examples/controlnet/control/dumpy_dataset.py +++ b/ppdiffusers/examples/controlnet/control/dumpy_dataset.py @@ -35,7 +35,8 @@ def __init__(self, tokenizer, file_path="./fill50k"): padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] def __len__(self): return len(self.data) @@ -63,9 +64,7 @@ def __getitem__(self, idx): input_ids = self.text_processing(prompt) return dict( - input_ids=paddle.to_tensor( - input_ids, dtype=paddle.int64), - pixel_values=paddle.to_tensor( - target.transpose([2, 0, 1]), dtype=paddle.float32), - controlnet_cond=paddle.to_tensor( - source.transpose([2, 0, 1]), dtype=paddle.float32), ) + input_ids=paddle.to_tensor(input_ids, dtype=paddle.int64), + pixel_values=paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32), + controlnet_cond=paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32), + ) diff --git a/ppdiffusers/examples/controlnet/control/model.py b/ppdiffusers/examples/controlnet/control/model.py index de2bfb4ee5d47..c0d86532d5021 100644 --- a/ppdiffusers/examples/controlnet/control/model.py +++ b/ppdiffusers/examples/controlnet/control/model.py @@ -22,9 +22,15 @@ from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler, - DDPMScheduler, LDMBertModel, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + DDPMScheduler, + LDMBertModel, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.initializer import reset_initialized_parameter from ppdiffusers.models.ema import LitEma from ppdiffusers.training_utils import freeze_params @@ -42,18 +48,20 @@ def __init__(self, model_args): # init tokenizer tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - model_max_length=model_args.model_max_length) + tokenizer_name_or_path, model_max_length=model_args.model_max_length + ) vae_name = "vqvae" if 
model_args.is_ldmbert else "vae" # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, vae_name)) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, vae_name) + ) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) freeze_params(self.vae.parameters()) @@ -62,55 +70,54 @@ def __init__(self, model_args): if model_args.is_ldmbert: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "bert")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "bert") + ) # init text_encoder - self.text_encoder = LDMBertModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path) else: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) freeze_params(self.text_encoder.parameters()) logger.info("Freeze text_encoder parameters!") unet_name_or_path = ( model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) freeze_params(self.unet.parameters()) logger.info("Freeze unet parameters!") - self.controlnet = ControlNetModel.from_unet( - self.unet, load_weights_from_unet=True) + self.controlnet = ControlNetModel.from_unet(self.unet, load_weights_from_unet=True) if not model_args.use_paddle_conv_init: # use torch conv2d init - reset_initialized_parameter( - self.controlnet.controlnet_cond_embedding.conv_in) - reset_initialized_parameter( - self.controlnet.controlnet_cond_embedding.blocks) + reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.conv_in) + reset_initialized_parameter(self.controlnet.controlnet_cond_embedding.blocks) self.noise_scheduler = DDPMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) self.eval_scheduler.set_timesteps(model_args.num_inference_steps) self.use_ema = model_args.use_ema if self.use_ema: @@ -118,15 +125,15 @@ def __init__(self, model_args): self.control_scales = [1.0] * 13 self.only_mid_control = model_args.only_mid_control - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() self.controlnet.enable_xformers_memory_efficient_attention() 
except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) @contextlib.contextmanager def ema_scope(self, context=None): @@ -147,11 +154,7 @@ def on_train_batch_end(self): if self.use_ema: self.model_ema(self.controlnet) - def forward(self, - input_ids=None, - pixel_values=None, - controlnet_cond=None, - **kwargs): + def forward(self, input_ids=None, pixel_values=None, controlnet_cond=None, **kwargs): self.train() with paddle.amp.auto_cast(enable=False): with paddle.no_grad(): @@ -160,11 +163,10 @@ def forward(self, latents = self.vae.encode(pixel_values).latent_dist.sample() latents = latents * 0.18215 noise = paddle.randn(latents.shape) - timesteps = paddle.randint( - 0, self.noise_scheduler.num_train_timesteps, - (latents.shape[0], )).astype("int64") - noisy_latents = self.noise_scheduler.add_noise(latents, noise, - timesteps) + timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype( + "int64" + ) + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) encoder_hidden_states = self.text_encoder(input_ids)[0] # control down_block_res_samples, mid_block_res_sample = self.controlnet( @@ -173,7 +175,8 @@ def forward(self, encoder_hidden_states=encoder_hidden_states, controlnet_cond=controlnet_cond, conditioning_scale=self.control_scales, - return_dict=False, ) + return_dict=False, + ) # predict the noise residual noise_pred = self.unet( @@ -181,7 +184,8 @@ def forward(self, timestep=timesteps, encoder_hidden_states=encoder_hidden_states, down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, ).sample + mid_block_additional_residual=mid_block_res_sample, + ).sample loss = F.mse_loss(noise_pred, noise, reduction="mean") return loss @@ -198,25 +202,23 @@ def decode_image(self, pixel_values=None, **kwargs): @paddle.no_grad() def decode_control_image(self, controlnet_cond=None, **kwargs): - return ((255 * controlnet_cond.transpose([0, 2, 3, 1])).cast("float32") - .numpy().round()) + return (255 * controlnet_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round() @paddle.no_grad() def log_image( - self, - input_ids=None, - controlnet_cond=None, - height=512, - width=512, - eta=0.0, - guidance_scale=7.5, - **kwargs, ): + self, + input_ids=None, + controlnet_cond=None, + height=512, + width=512, + eta=0.0, + guidance_scale=7.5, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 8 image if input_ids.shape[0] > 4: input_ids = input_ids[:4] @@ -230,34 +232,30 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, - height // 8, width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) # ddim donot use this latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta - controlnet_cond_input = (paddle.concat([controlnet_cond] * 2) - if do_classifier_free_guidance else - controlnet_cond) + controlnet_cond_input = ( + paddle.concat([controlnet_cond] * 2) if do_classifier_free_guidance else controlnet_cond + ) for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # ControlNet predict the noise residual down_block_res_samples, mid_block_res_sample = self.controlnet( @@ -266,7 +264,8 @@ def log_image( encoder_hidden_states=text_embeddings, controlnet_cond=controlnet_cond_input, conditioning_scale=self.control_scales, - return_dict=False, ) + return_dict=False, + ) # predict the noise residual noise_pred = self.unet( @@ -274,17 +273,16 @@ def log_image( t, encoder_hidden_states=text_embeddings, down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, ).sample + mid_block_additional_residual=mid_block_res_sample, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample @@ -296,7 +294,6 @@ def set_recompute(self, value=False): def fn(layer): if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.controlnet.apply(fn) diff --git a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py b/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py index 
0cb439f90dd2b..17582dd93e648 100644 --- a/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py +++ b/ppdiffusers/examples/controlnet/extract_controlnet_ema_weights.py @@ -24,13 +24,11 @@ def extract_controlnet_ema_weights(model_path, output_path): for k in state_dict.keys(): if k.startswith("controlnet."): flat_ema_key = "model_ema." + "".join(k.split(".")[1:]) - ema_state_dict[k.replace("controlnet.", "")] = state_dict.get( - flat_ema_key) + ema_state_dict[k.replace("controlnet.", "")] = state_dict.get(flat_ema_key) if len(ema_state_dict) == 0: raise ValueError("Can not extract ema weights!") os.makedirs(output_path, exist_ok=True) - paddle.save(ema_state_dict, - os.path.join(output_path, "model_state.ema.pdparams")) + paddle.save(ema_state_dict, os.path.join(output_path, "model_state.ema.pdparams")) print(f"Save EMA weights to {output_path} !") @@ -40,11 +38,13 @@ def extract_controlnet_ema_weights(model_path, output_path): "--model_path", type=str, default="./model_state.pdparams", - help="model_state.", ) + help="model_state.", + ) parser.add_argument( "--output_path", type=str, default="ema_controlnet", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() extract_controlnet_ema_weights(args.model_path, args.output_path) diff --git a/ppdiffusers/examples/controlnet/gradio_canny2image.py b/ppdiffusers/examples/controlnet/gradio_canny2image.py index 5dc43a6ca4f8e..5c0ad9e936299 100644 --- a/ppdiffusers/examples/controlnet/gradio_canny2image.py +++ b/ppdiffusers/examples/controlnet/gradio_canny2image.py @@ -27,39 +27,37 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - low_threshold, - high_threshold, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, + low_threshold, + high_threshold, +): with paddle.no_grad(): img = resize_image(HWC3(input_image), image_resolution) H, W, C = img.shape detected_map = apply_canny(img, low_threshold, high_threshold) detected_map = HWC3(detected_map) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -75,7 +73,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [255 - detected_map] + results @@ -91,59 +90,55 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) low_threshold = gr.Slider( label="Canny low threshold", minimum=1, maximum=255, value=100, - step=1, ) + step=1, + ) high_threshold = gr.Slider( label="Canny high threshold", minimum=1, maximum=255, value=200, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_depth2image.py b/ppdiffusers/examples/controlnet/gradio_depth2image.py index 63b50704b9bff..67f33684cc947 100644 --- a/ppdiffusers/examples/controlnet/gradio_depth2image.py +++ b/ppdiffusers/examples/controlnet/gradio_depth2image.py @@ -28,37 +28,34 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) - detected_map, _ = apply_midas( - resize_image(input_image, detect_resolution)) + detected_map, _ = apply_midas(resize_image(input_image, detect_resolution)) detected_map = HWC3(detected_map) 
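The control_scales expression reformatted in gradio_canny2image above (and repeated in the other demo scripts below) is worth a line of arithmetic: with guess mode enabled it yields 13 per-residual weights that decay geometrically by a factor of 0.825, so the last entry (applied to the deepest, mid-block residual) keeps the full strength while earlier entries are progressively attenuated. A small illustration with an assumed strength of 1.0; the numbers are plain arithmetic, not taken from the repository:

strength = 1.0
scales = [strength * (0.825 ** float(12 - i)) for i in range(13)]
# i = 12 -> 0.825**0  = 1.0      full weight on the last (mid-block) residual
# i = 0  -> 0.825**12 ~= 0.099   roughly a tenth of `strength` on the first one
# without guess mode every entry is simply `strength`
assert len(scales) == 13 and abs(scales[-1] - strength) < 1e-9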
img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) if seed == -1: @@ -75,7 +72,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=1.0, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -91,53 +89,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Depth Resolution", minimum=128, maximum=1024, value=384, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_hed2image.py b/ppdiffusers/examples/controlnet/gradio_hed2image.py index 87e37dccb3043..9394f85ba697d 100644 --- a/ppdiffusers/examples/controlnet/gradio_hed2image.py +++ b/ppdiffusers/examples/controlnet/gradio_hed2image.py @@ -28,25 +28,25 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with 
paddle.no_grad(): input_image = HWC3(input_image) detected_map = apply_hed(resize_image(input_image, detect_resolution)) @@ -54,16 +54,13 @@ def process( img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_LINEAR) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -79,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -95,53 +93,42 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) - detect_resolution = gr.Slider( - label="HED Resolution", - minimum=128, - maximum=1024, - value=512, - step=1) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + detect_resolution = gr.Slider(label="HED Resolution", minimum=128, maximum=1024, value=512, step=1) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_hough2image.py b/ppdiffusers/examples/controlnet/gradio_hough2image.py index eef44cc32f7b0..65ff6c1410769 100644 --- a/ppdiffusers/examples/controlnet/gradio_hough2image.py +++ b/ppdiffusers/examples/controlnet/gradio_hough2image.py @@ -28,46 +28,44 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") pipe = 
StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - value_threshold, - distance_threshold, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, + value_threshold, + distance_threshold, +): with paddle.no_grad(): input_image = HWC3(input_image) detected_map = apply_mlsd( resize_image(input_image, detect_resolution), value_threshold, - distance_threshold, ) + distance_threshold, + ) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -83,7 +81,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -99,65 +98,62 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Hough Line Resolution", minimum=128, maximum=1024, value=512, - step=1, ) + step=1, + ) value_threshold = gr.Slider( label="Hough value threshold (MLSD)", minimum=0.01, maximum=2.0, value=0.1, - step=0.01, ) + step=0.01, + ) distance_threshold = gr.Slider( label="Hough distance threshold (MLSD)", minimum=0.01, maximum=20.0, value=0.1, - step=0.01, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=0.01, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = 
gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py b/ppdiffusers/examples/controlnet/gradio_ip2p2image.py index 0d23830f2c4be..7f164b57c63be 100644 --- a/ppdiffusers/examples/controlnet/gradio_ip2p2image.py +++ b/ppdiffusers/examples/controlnet/gradio_ip2p2image.py @@ -23,41 +23,37 @@ from ppdiffusers import ControlNetModel, StableDiffusionControlNetPipeline -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/control_v11e_sd15_ip2p") +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_ip2p") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): img = resize_image(HWC3(input_image), image_resolution) detected_map = input_image.copy() H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_LINEAR) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -73,7 +69,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -89,47 +86,41 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_normal2image.py b/ppdiffusers/examples/controlnet/gradio_normal2image.py index 6ce2e56d8ea3c..69bf238fe4521 100644 --- a/ppdiffusers/examples/controlnet/gradio_normal2image.py +++ b/ppdiffusers/examples/controlnet/gradio_normal2image.py @@ -28,43 +28,39 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, - bg_threshold, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, + bg_threshold, +): with paddle.no_grad(): input_image = HWC3(input_image) - _, detected_map = apply_midas( - resize_image(input_image, detect_resolution), bg_th=bg_threshold) + _, detected_map = apply_midas(resize_image(input_image, detect_resolution), bg_th=bg_threshold) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + 
detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -80,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -96,59 +93,55 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Normal Resolution", minimum=128, maximum=1024, value=384, - step=1, ) + step=1, + ) bg_threshold = gr.Slider( label="Normal background threshold", minimum=0.0, maximum=1.0, value=0.4, - step=0.01, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=0.01, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py index 07a52bcf286d3..e932854042b60 100644 --- a/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py +++ b/ppdiffusers/examples/controlnet/gradio_pose2image_openpose.py @@ -26,45 +26,41 @@ apply_openpose = OpenposePaddleDetector() -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose") +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, 
- safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + hand, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) detected_map, _ = apply_openpose(input_image, detect_resolution, hand) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -80,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -97,53 +94,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="OpenPose Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, hand, 
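All of these gradio_*2image scripts share the same conditioning-image plumbing before the pipeline call: detector output, resized to the target height and width, converted to a float32 NCHW tensor scaled to [0, 1]. A standalone sketch of that step; detected_map stands for the HWC3 output of whichever detector (Canny, HED, MiDaS, MLSD, OpenPose, segmentation) the demo uses, and the interpolation default mirrors the nearest-neighbour resize most of the scripts use:

import cv2
import paddle

def make_control_tensor(detected_map, height, width, interpolation=cv2.INTER_NEAREST):
    # match the size of the image that will be generated
    detected_map = cv2.resize(detected_map, (width, height), interpolation=interpolation)
    control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0
    # HWC -> NCHW with a batch dimension of 1, as the pipeline expects
    return control.unsqueeze(0).transpose([0, 3, 1, 2])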
diff --git a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py b/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py index 51a713db003db..097bbd83516d3 100644 --- a/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py +++ b/ppdiffusers/examples/controlnet/gradio_pose2image_ppdetpose.py @@ -26,45 +26,41 @@ apply_ppdetpose = PPDetDetector() -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose") +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - hand, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + hand, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) detected_map, _ = apply_ppdetpose(input_image, detect_resolution, hand) detected_map = HWC3(detected_map) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. 
Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -80,7 +76,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -97,53 +94,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="OpenPose Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, hand, diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py index 0d89c0899ecb4..1e8bd335f71a5 100644 --- a/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py +++ b/ppdiffusers/examples/controlnet/gradio_seg2image_segformer.py @@ -28,42 +28,38 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) - detected_map = apply_uniformer( - resize_image(input_image, detect_resolution)) + detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), 
interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -79,7 +75,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -95,53 +92,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Segmentation Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py b/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py index b517ba3b94cc4..a99e82a4ea7e5 100644 --- a/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py +++ b/ppdiffusers/examples/controlnet/gradio_seg2image_segmenter.py @@ -28,42 +28,38 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - detect_resolution, - 
ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + detect_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): input_image = HWC3(input_image) - detected_map = apply_uniformer( - resize_image(input_image, detect_resolution)) + detected_map = apply_uniformer(resize_image(input_image, detect_resolution)) img = resize_image(input_image, image_resolution) H, W, C = img.shape - detected_map = cv2.resize( - detected_map, (W, H), interpolation=cv2.INTER_NEAREST) + detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = ( - [strength * (0.825**float(12 - i)) for i in range(13)] - if guess_mode else ([strength] * 13) + [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13) ) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01 if seed == -1: seed = random.randint(0, 65535) @@ -79,7 +75,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -95,53 +92,48 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) detect_resolution = gr.Slider( label="Segmentation Resolution", minimum=128, maximum=1024, value=512, - step=1, ) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + step=1, + ) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py b/ppdiffusers/examples/controlnet/gradio_shuffle2image.py index da5f05c890081..0e6313d0d407c 100644 --- 
a/ppdiffusers/examples/controlnet/gradio_shuffle2image.py +++ b/ppdiffusers/examples/controlnet/gradio_shuffle2image.py @@ -25,34 +25,32 @@ apply_shuffle = ContentShuffleDetector() -controlnet = ControlNetModel.from_pretrained( - "lllyasviel/control_v11e_sd15_shuffle") +controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) def process( - input_image, - prompt, - a_prompt, - n_prompt, - num_samples, - image_resolution, - ddim_steps, - guess_mode, - strength, - scale, - seed, - eta, ): + input_image, + prompt, + a_prompt, + n_prompt, + num_samples, + image_resolution, + ddim_steps, + guess_mode, + strength, + scale, + seed, + eta, +): with paddle.no_grad(): img = resize_image(HWC3(input_image), image_resolution) H, W, C = img.shape detected_map = apply_shuffle(img, w=W, h=H, f=256) - control = paddle.to_tensor( - detected_map.copy(), dtype=paddle.float32) / 255.0 + control = paddle.to_tensor(detected_map.copy(), dtype=paddle.float32) / 255.0 control = control.unsqueeze(0).transpose([0, 3, 1, 2]) control_scales = [strength] * 13 @@ -70,7 +68,8 @@ def process( width=W, eta=eta, controlnet_conditioning_scale=control_scales, - guidance_scale=scale, ).images[0] + guidance_scale=scale, + ).images[0] results.append(img) return [detected_map] + results @@ -86,47 +85,41 @@ def process( prompt = gr.Textbox(label="Prompt") run_button = gr.Button(label="Run") with gr.Accordion("Advanced options", open=False): - num_samples = gr.Slider( - label="Images", minimum=1, maximum=12, value=1, step=1) + num_samples = gr.Slider(label="Images", minimum=1, maximum=12, value=1, step=1) image_resolution = gr.Slider( label="Image Resolution", minimum=256, maximum=768, value=512, - step=64, ) + step=64, + ) strength = gr.Slider( label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, - step=0.01, ) + step=0.01, + ) guess_mode = gr.Checkbox(label="Guess Mode", value=False) - ddim_steps = gr.Slider( - label="Steps", minimum=1, maximum=100, value=20, step=1) + ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=20, step=1) scale = gr.Slider( label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, - step=0.1, ) - seed = gr.Slider( - label="Seed", - minimum=-1, - maximum=2147483647, - step=1, - randomize=True) + step=0.1, + ) + seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, step=1, randomize=True) eta = gr.Number(label="eta (DDIM)", value=0.0) - a_prompt = gr.Textbox( - label="Added Prompt", - value="best quality, extremely detailed") + a_prompt = gr.Textbox(label="Added Prompt", value="best quality, extremely detailed") n_prompt = gr.Textbox( label="Negative Prompt", value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality", ) with gr.Column(): - result_gallery = gr.Gallery( - label="Output", show_label=False, elem_id="gallery").style( - grid=2, height="auto") + result_gallery = gr.Gallery(label="Output", show_label=False, elem_id="gallery").style( + grid=2, height="auto" + ) ips = [ input_image, prompt, diff --git a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py b/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py index f94a1bebdee43..34910428889af 100644 --- a/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py 
+++ b/ppdiffusers/examples/controlnet/train_txt2img_control_trainer.py @@ -16,10 +16,14 @@ import os import paddle -from control import (ControlNet, ControlNetTrainer, DataArguments, - Fill50kDataset, ModelArguments) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from control import ( + ControlNet, + ControlNetTrainer, + DataArguments, + Fill50kDataset, + ModelArguments, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger @@ -29,15 +33,14 @@ def unfreeze_params(params): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) + math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps + ) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -45,16 +48,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -67,7 +68,8 @@ def main(): model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) @@ -76,7 +78,8 @@ def main(): trainer.model.controlnet.parameters(), trainer.model.unet.up_blocks.parameters(), trainer.model.unet.conv_norm_out.parameters(), - trainer.model.unet.conv_out.parameters(), ) + trainer.model.unet.conv_out.parameters(), + ) unfreeze_params(params_to_train) else: params_to_train = trainer.model.controlnet.parameters() diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth.py b/ppdiffusers/examples/dreambooth/train_dreambooth.py index f9e184fbf53a3..1aedbd3d57952 100644 --- a/ppdiffusers/examples/dreambooth/train_dreambooth.py +++ b/ppdiffusers/examples/dreambooth/train_dreambooth.py @@ -29,10 +29,10 @@ import paddle.nn as nn import paddle.nn.functional as F from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients -from paddle.io import (BatchSampler, DataLoader, Dataset, - DistributedBatchSampler) +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms from paddlenlp.trainer import set_seed @@ -41,8 +41,13 @@ from PIL import Image from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - UNet2DConditionModel, is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.models.modeling_utils import freeze_params, unwrap_model from ppdiffusers.optimization import get_scheduler from ppdiffusers.utils import check_min_version @@ -52,8 +57,7 @@ def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) class Lambda(BaseTransform): @@ -65,11 +69,11 @@ def _apply_image(self, img): return self.fn(img) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -78,8 +82,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -87,8 +92,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from 
ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -104,8 +110,7 @@ def fn(layer): # unet if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) model.apply(fn) @@ -125,8 +130,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training dreambooth script.") + parser = argparse.ArgumentParser(description="Simple example of a training dreambooth script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -145,19 +149,22 @@ def parse_args(input_args=None): type=str, default=None, required=True, - help="A folder containing the training data of instance images.", ) + help="A folder containing the training data of instance images.", + ) parser.add_argument( "--class_data_dir", type=str, default=None, required=False, - help="A folder containing the training data of class images.", ) + help="A folder containing the training data of class images.", + ) parser.add_argument( "--instance_prompt", type=str, default=None, required=True, - help="The prompt with identifier specifying the instance", ) + help="The prompt with identifier specifying the instance", + ) parser.add_argument( "--class_prompt", type=str, @@ -168,12 +175,14 @@ def parse_args(input_args=None): "--with_prior_preservation", default=False, action="store_true", - help="Flag to add prior preservation loss.", ) + help="Flag to add prior preservation loss.", + ) parser.add_argument( "--prior_loss_weight", type=float, default=1.0, - help="The weight of prior preservation loss.", ) + help="The weight of prior preservation loss.", + ) parser.add_argument( "--num_class_images", type=int, @@ -181,39 +190,42 @@ def parse_args(input_args=None): help=( "Minimal class images for prior preservation loss. If there are not enough images already present in" " class_data_dir, additional images will be sampled with class_prompt." - ), ) + ), + ) parser.add_argument( "--output_dir", type=str, default="./dreambooth-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -221,11 +233,13 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." 
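Several of the flags re-wrapped in this argparse block (--with_prior_preservation, --prior_loss_weight, --num_class_images) only matter for the prior-preservation variant of DreamBooth. As a reminder of how that weight is typically applied, a hedged sketch (not the verbatim code from this script): the batch concatenates instance and class examples, the prediction is split in two, and the class ("prior") term is scaled by prior_loss_weight before being added to the instance loss.

import paddle.nn.functional as F

def dreambooth_loss(noise_pred, target, prior_loss_weight, with_prior_preservation):
    if with_prior_preservation:
        # first half of the batch: instance images, second half: class images
        noise_pred, noise_pred_prior = noise_pred.chunk(2, axis=0)
        target, target_prior = target.chunk(2, axis=0)
        instance_loss = F.mse_loss(noise_pred, target, reduction="mean")
        prior_loss = F.mse_loss(noise_pred_prior, target_prior, reduction="mean")
        return instance_loss + prior_loss_weight * prior_loss
    return F.mse_loss(noise_pred, target, reduction="mean")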
- ), ) + ), + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -235,12 +249,14 @@ def parse_args(input_args=None): "--train_batch_size", type=int, default=4, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--sample_batch_size", type=int, default=4, - help="Batch size (per device) for sampling images.", ) + help="Batch size (per device) for sampling images.", + ) parser.add_argument("--num_train_epochs", type=int, default=1) parser.add_argument( "--max_train_steps", @@ -277,12 +293,15 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -293,45 +312,47 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) + help="Power factor of the polynomial scheduler.", + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -344,27 +365,28 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. 
Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=1.0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=1.0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) @@ -376,20 +398,15 @@ def parse_args(input_args=None): if args.with_prior_preservation: if args.class_data_dir is None: - raise ValueError( - "You must specify a data directory for class images.") + raise ValueError("You must specify a data directory for class images.") if args.class_prompt is None: raise ValueError("You must specify prompt for class images.") else: # logger is not available yet if args.class_data_dir is not None: - warnings.warn( - "You need not use --class_data_dir without --with_prior_preservation." - ) + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") if args.class_prompt is not None: - warnings.warn( - "You need not use --class_prompt without --with_prior_preservation." - ) + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") args.logging_dir = os.path.join(args.output_dir, args.logging_dir) if args.height is None or args.width is None and args.resolution is not None: @@ -405,18 +422,19 @@ class DreamBoothDataset(Dataset): """ def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - class_num=None, - height=512, - width=512, - center_crop=False, - interpolation="bilinear", - random_flip=False, ): + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + class_num=None, + height=512, + width=512, + center_crop=False, + interpolation="bilinear", + random_flip=False, + ): self.height = height self.width = width self.center_crop = center_crop @@ -442,8 +460,7 @@ def __init__( if any(suffix in p.name for suffix in ext): self.class_images_path.append(p) if class_num is not None: - self.num_class_images = min( - len(self.class_images_path), class_num) + self.num_class_images = min(len(self.class_images_path), class_num) else: self.num_class_images = len(self.class_images_path) self._length = max(self.num_class_images, self.num_instance_images) @@ -451,24 +468,22 @@ def __init__( else: self.class_data_root = None - self.image_transforms = transforms.Compose([ - transforms.Resize( - (height, width), interpolation=interpolation), - transforms.CenterCrop((height, width)) - if center_crop else transforms.RandomCrop((height, width)), - transforms.RandomHorizontalFlip() - if random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + self.image_transforms = transforms.Compose( + [ + transforms.Resize((height, width), interpolation=interpolation), + transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)), + 
transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def __len__(self): return self._length def __getitem__(self, index): example = {} - instance_image = Image.open(self.instance_images_path[ - index % self.num_instance_images]) + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) if not instance_image.mode == "RGB": instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) @@ -477,11 +492,11 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids if self.class_data_root: - class_image = Image.open(self.class_images_path[ - index % self.num_class_images]) + class_image = Image.open(self.class_images_path[index % self.num_class_images]) if not class_image.mode == "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) @@ -490,7 +505,8 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids return example @@ -512,9 +528,7 @@ def __getitem__(self, index): return example -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -547,45 +561,43 @@ def main(): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - if (args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + requires_safety_checker=False, + ) + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: pipeline.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) pipeline.set_progress_bar_config(disable=True) num_new_images = args.num_class_images - cur_class_images logger.info(f"Number of class images to sample: {num_new_images}.") sample_dataset = PromptDataset(args.class_prompt, num_new_images) - batch_sampler = (DistributedBatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False) if num_processes > 1 else BatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False)) + batch_sampler = ( + DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + if num_processes > 1 + else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + ) sample_dataloader = DataLoader( sample_dataset, batch_sampler=batch_sampler, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) for example in tqdm( - sample_dataloader, - desc="Generating class images", - disable=not is_main_process, ): + sample_dataloader, + desc="Generating class images", + disable=not is_main_process, + ): images = pipeline(example["prompt"]).images for i, image in enumerate(images): hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = ( - class_images_dir / - f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - ) + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) pipeline.to("cpu") del pipeline @@ -597,17 +609,14 @@ def main(): if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -617,30 +626,26 @@ def main(): if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - 
text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) freeze_params(vae.parameters()) if not args.train_text_encoder: @@ -650,21 +655,20 @@ def main(): if args.train_text_encoder: set_recompute(text_encoder, True) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # Dataset and DataLoaders creation: train_dataset = DreamBoothDataset( instance_data_root=args.instance_data_dir, instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir - if args.with_prior_preservation else None, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, class_prompt=args.class_prompt, class_num=args.num_class_images, tokenizer=tokenizer, @@ -672,7 +676,8 @@ def main(): width=args.width, center_crop=args.center_crop, interpolation="bilinear", - random_flip=args.random_flip, ) + random_flip=args.random_flip, + ) def collate_fn(examples): input_ids = [example["instance_prompt_ids"] for example in examples] @@ -687,38 +692,35 @@ def collate_fn(examples): pixel_values = paddle.stack(pixel_values).astype("float32") input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. 
- num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -726,23 +728,22 @@ def collate_fn(examples): text_encoder = paddle.DataParallel(text_encoder) params_to_optimize = ( - list(unet.parameters()) + list(text_encoder.parameters()) - if args.train_text_encoder else unet.parameters()) + list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) # Initialize the optimizer optimizer = AdamW( learning_rate=lr_scheduler, @@ -751,8 +752,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if is_main_process: logger.info("----------- Configuration Arguments -----------") @@ -762,25 +763,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
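# Note on the line below: tqdm's `disable` argument silences the bar, so in a multi-process
# launch only the main process (is_main_process) renders training progress.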
- progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 @@ -803,22 +798,24 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image timesteps = paddle.randint( 0, noise_scheduler.config.num_train_timesteps, - (batch_size, ), - dtype="int64", ) + (batch_size,), + dtype="int64", + ) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where @@ -826,55 +823,45 @@ def collate_fn(examples): if args.train_text_encoder: text_encoder_ctx_manager = text_encoder.no_sync() else: - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) - else contextlib.suppress()) + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) with text_encoder_ctx_manager: # Get the text embedding for conditioning if use_attention_mask: - attention_mask = (batch["input_ids"] != - tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") if args.with_prior_preservation: # Chunk the noise and model_pred into two parts and compute the loss on each part separately. 
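# Context for the chunking below, assuming the collate step stacks instance examples first and
# class examples second (as in the standard DreamBooth prior-preservation recipe): chunk(2, axis=0)
# splits predictions/targets back into those halves; the instance half takes the reconstruction
# loss and the class half the prior loss, combined as
#   loss = mse(pred, target) + prior_loss_weight * mse(pred_prior, target_prior)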
- model_pred, model_pred_prior = model_pred.chunk( - 2, axis=0) + model_pred, model_pred_prior = model_pred.chunk(2, axis=0) target, target_prior = target.chunk(2, axis=0) # Compute instance loss loss = F.mse_loss(model_pred, target, reduction="mean") # Compute prior loss - prior_loss = F.mse_loss( - model_pred_prior, target_prior, reduction="mean") + prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean") # Add the prior loss to the instance loss. loss = loss + args.prior_loss_weight * prior_loss @@ -908,13 +895,10 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") - unwrap_model(unet).save_pretrained( - os.path.join(save_path, "unet")) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet")) if args.train_text_encoder: - unwrap_model(text_encoder).save_pretrained( - os.path.join(save_path, "text_encoder")) + unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder")) if global_step >= args.max_train_steps: break @@ -926,14 +910,12 @@ def collate_fn(examples): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unwrap_model(unet), - text_encoder=unwrap_model(text_encoder), ) + text_encoder=unwrap_model(text_encoder), + ) pipeline.save_pretrained(args.output_dir) if args.push_to_hub: - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) if __name__ == "__main__": diff --git a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py b/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py index b8837db5fb804..b36bc8b8f2130 100644 --- a/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py +++ b/ppdiffusers/examples/dreambooth/train_dreambooth_lora.py @@ -32,10 +32,10 @@ import paddle.nn.functional as F import requests from huggingface_hub import HfFolder, create_repo, upload_folder, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients -from paddle.io import (BatchSampler, DataLoader, Dataset, - DistributedBatchSampler) +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms from paddlenlp.trainer import set_seed @@ -44,12 +44,21 @@ from PIL import Image from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin from ppdiffusers.models.attention_processor import ( - AttnProcessor, AttnProcessor2_5, LoRAAttnProcessor, LoRAAttnProcessor2_5) + AttnProcessor, + AttnProcessor2_5, + LoRAAttnProcessor, + LoRAAttnProcessor2_5, +) from ppdiffusers.optimization import get_scheduler from ppdiffusers.training_utils import freeze_params, unwrap_model from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, check_min_version @@ -62,14 +71,14 @@ def _retry( 
- func, - func_args: Optional[tuple]=None, - func_kwargs: Optional[dict]=None, - exceptions: Type[requests.exceptions.RequestException]=requests. - exceptions.RequestException, - max_retries: int=0, - base_wait_time: float=0.5, - max_wait_time: float=2, ): + func, + func_args: Optional[tuple] = None, + func_kwargs: Optional[dict] = None, + exceptions: Type[requests.exceptions.RequestException] = requests.exceptions.RequestException, + max_retries: int = 0, + base_wait_time: float = 0.5, + max_wait_time: float = 2, +): func_args = func_args or () func_kwargs = func_kwargs or {} retry = 0 @@ -80,27 +89,24 @@ def _retry( if retry >= max_retries: raise err else: - sleep_time = min(max_wait_time, base_wait_time * 2 - **retry) # Exponential backoff - logger.info( - f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]" - ) + sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff + logger.info(f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]") time.sleep(sleep_time) retry += 1 def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) def save_model_card( - repo_id: str, - images=None, - base_model=str, - train_text_encoder=False, - prompt=str, - repo_folder=None, ): + repo_id: str, + images=None, + base_model=str, + train_text_encoder=False, + prompt=str, + repo_folder=None, +): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -133,11 +139,11 @@ def save_model_card( f.write(yaml + model_card) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -146,8 +152,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -155,8 +162,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -187,8 +195,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training dreambooth lora script.") + parser = argparse.ArgumentParser(description="Simple example of a training dreambooth lora script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -207,19 +214,22 @@ def parse_args(input_args=None): type=str, default=None, required=True, - help="A folder containing the training data of instance images.", ) + help="A folder containing the training data 
of instance images.", + ) parser.add_argument( "--class_data_dir", type=str, default=None, required=False, - help="A folder containing the training data of class images.", ) + help="A folder containing the training data of class images.", + ) parser.add_argument( "--instance_prompt", type=str, default=None, required=True, - help="The prompt with identifier specifying the instance", ) + help="The prompt with identifier specifying the instance", + ) parser.add_argument( "--class_prompt", type=str, @@ -230,7 +240,8 @@ def parse_args(input_args=None): "--validation_prompt", type=str, default=None, - help="A prompt that is sampled during training for inference.", ) + help="A prompt that is sampled during training for inference.", + ) parser.add_argument( "--num_validation_images", type=int, @@ -244,17 +255,20 @@ def parse_args(input_args=None): help=( "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`." - ), ) + ), + ) parser.add_argument( "--with_prior_preservation", default=False, action="store_true", - help="Flag to add prior preservation loss.", ) + help="Flag to add prior preservation loss.", + ) parser.add_argument( "--prior_loss_weight", type=float, default=1.0, - help="The weight of prior preservation loss.", ) + help="The weight of prior preservation loss.", + ) parser.add_argument( "--num_class_images", type=int, @@ -262,44 +276,48 @@ def parse_args(input_args=None): help=( "Minimal class images for prior preservation loss. If there are not enough images already present in" " class_data_dir, additional images will be sampled with class_prompt." - ), ) + ), + ) parser.add_argument( "--output_dir", type=str, default="lora-dreambooth-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--lora_rank", type=int, default=4, - help="The rank of lora linear.", ) + help="The rank of lora linear.", + ) parser.add_argument( "--center_crop", default=False, @@ -307,16 +325,19 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." 
- ), ) + ), + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=4, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -326,7 +347,8 @@ def parse_args(input_args=None): "--sample_batch_size", type=int, default=4, - help="Batch size (per device) for sampling images.", ) + help="Batch size (per device) for sampling images.", + ) parser.add_argument("--num_train_epochs", type=int, default=1) parser.add_argument( "--max_train_steps", @@ -338,7 +360,8 @@ def parse_args(input_args=None): "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--gradient_accumulation_steps", type=int, @@ -368,12 +391,15 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -384,45 +410,47 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) + help="Power factor of the polynomial scheduler.", + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -435,22 +463,22 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. 
Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) @@ -462,20 +490,15 @@ def parse_args(input_args=None): if args.with_prior_preservation: if args.class_data_dir is None: - raise ValueError( - "You must specify a data directory for class images.") + raise ValueError("You must specify a data directory for class images.") if args.class_prompt is None: raise ValueError("You must specify prompt for class images.") else: # logger is not available yet if args.class_data_dir is not None: - warnings.warn( - "You need not use --class_data_dir without --with_prior_preservation." - ) + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") if args.class_prompt is not None: - warnings.warn( - "You need not use --class_prompt without --with_prior_preservation." - ) + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") args.logging_dir = os.path.join(args.output_dir, args.logging_dir) if args.height is None or args.width is None and args.resolution is not None: @@ -491,18 +514,19 @@ class DreamBoothDataset(Dataset): """ def __init__( - self, - instance_data_root, - instance_prompt, - tokenizer, - class_data_root=None, - class_prompt=None, - class_num=None, - height=512, - width=512, - center_crop=False, - interpolation="bilinear", - random_flip=False, ): + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + class_num=None, + height=512, + width=512, + center_crop=False, + interpolation="bilinear", + random_flip=False, + ): self.height = height self.width = width self.center_crop = center_crop @@ -528,8 +552,7 @@ def __init__( if any(suffix in p.name for suffix in ext): self.class_images_path.append(p) if class_num is not None: - self.num_class_images = min( - len(self.class_images_path), class_num) + self.num_class_images = min(len(self.class_images_path), class_num) else: self.num_class_images = len(self.class_images_path) self._length = max(self.num_class_images, self.num_instance_images) @@ -537,24 +560,22 @@ def __init__( else: self.class_data_root = None - self.image_transforms = transforms.Compose([ - transforms.Resize( - (height, width), interpolation=interpolation), - transforms.CenterCrop((height, width)) - if center_crop else transforms.RandomCrop((height, width)), - transforms.RandomHorizontalFlip() - if random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + self.image_transforms = transforms.Compose( + [ + transforms.Resize((height, width), interpolation=interpolation), + transforms.CenterCrop((height, width)) if center_crop else transforms.RandomCrop((height, width)), + transforms.RandomHorizontalFlip() if random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def __len__(self): return self._length def __getitem__(self, index): example = {} - 
instance_image = Image.open(self.instance_images_path[ - index % self.num_instance_images]) + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) if not instance_image.mode == "RGB": instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) @@ -563,11 +584,11 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids if self.class_data_root: - class_image = Image.open(self.class_images_path[ - index % self.num_class_images]) + class_image = Image.open(self.class_images_path[index % self.num_class_images]) if not class_image.mode == "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) @@ -576,7 +597,8 @@ def __getitem__(self, index): padding="do_not_pad", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids return example @@ -598,9 +620,7 @@ def __getitem__(self, index): return example -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -633,45 +653,43 @@ def main(): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - if (args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + requires_safety_checker=False, + ) + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: pipeline.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warning( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) pipeline.set_progress_bar_config(disable=True) num_new_images = args.num_class_images - cur_class_images logger.info(f"Number of class images to sample: {num_new_images}.") sample_dataset = PromptDataset(args.class_prompt, num_new_images) - batch_sampler = (DistributedBatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False) if num_processes > 1 else BatchSampler( - sample_dataset, - batch_size=args.sample_batch_size, - shuffle=False)) + batch_sampler = ( + DistributedBatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + if num_processes > 1 + else BatchSampler(sample_dataset, batch_size=args.sample_batch_size, shuffle=False) + ) sample_dataloader = DataLoader( sample_dataset, batch_sampler=batch_sampler, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) for example in tqdm( - sample_dataloader, - desc="Generating class images", - disable=not is_main_process, ): + sample_dataloader, + desc="Generating class images", + disable=not is_main_process, + ): images = pipeline(example["prompt"]).images for i, image in enumerate(images): hash_image = hashlib.sha1(image.tobytes()).hexdigest() - image_filename = ( - class_images_dir / - f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" - ) + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" image.save(image_filename) pipeline.to("cpu") del pipeline @@ -687,53 +705,50 @@ def main(): elif args.pretrained_model_name_or_path: try: tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, - "tokenizer")) + url_or_path_join(args.pretrained_model_name_or_path, "tokenizer") + ) except KeyError as e: if "XLMRobertaTokenizer" in str(e): from paddlenlp.transformers import XLMRobertaTokenizer tokenizer = XLMRobertaTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, - "tokenizer")) + url_or_path_join(args.pretrained_model_name_or_path, "tokenizer") + ) else: raise e # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = 
UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) # We only train the additional adapter LoRA layers freeze_params(vae.parameters()) freeze_params(text_encoder.parameters()) freeze_params(unet.parameters()) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warning( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # now we will add new LoRA weights to the attention layers # It's important to realize here how many attention weights will be added and of which sizes # The sizes of the attention layers consist only of two different variables: @@ -750,14 +765,12 @@ def main(): # Set correct lora layers unet_lora_attn_procs = {} for name, attn_processor in unet.attn_processors.items(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - unet.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] @@ -767,14 +780,13 @@ def main(): elif isinstance(attn_processor, AttnProcessor2_5): lora_attn_processor_class = LoRAAttnProcessor2_5 else: - raise ValueError( - f"Unknown attention processor type: {attn_processor.__class__.__name__}" - ) + raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}") unet_lora_attn_procs[name] = lora_attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) unet.set_attn_processor(unet_lora_attn_procs) unet_lora_layers = AttnProcsLayers(unet.attn_processors) @@ -790,10 +802,12 @@ def main(): text_lora_attn_procs[name] = LoRAAttnProcessor( hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) temp_pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, text_encoder=text_encoder) + args.pretrained_model_name_or_path, text_encoder=text_encoder + ) temp_pipeline._modify_text_encoder(text_lora_attn_procs) text_encoder = temp_pipeline.text_encoder del temp_pipeline @@ -802,8 +816,7 @@ def main(): train_dataset = DreamBoothDataset( instance_data_root=args.instance_data_dir, instance_prompt=args.instance_prompt, - class_data_root=args.class_data_dir - if args.with_prior_preservation else None, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, class_prompt=args.class_prompt, class_num=args.num_class_images, tokenizer=tokenizer, @@ -811,7 +824,8 @@ def main(): width=args.width, center_crop=args.center_crop, interpolation="bilinear", - random_flip=args.random_flip, ) + random_flip=args.random_flip, + ) def collate_fn(examples): input_ids = 
[example["instance_prompt_ids"] for example in examples] @@ -826,58 +840,55 @@ def collate_fn(examples): pixel_values = paddle.stack(pixel_values).astype("float32") input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) - params_to_optimize = (list(unet_lora_layers.parameters()) + - list(text_encoder_lora_layers.parameters()) - if args.train_text_encoder else - unet_lora_layers.parameters()) + params_to_optimize = ( + list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) # Optimizer creation optimizer = AdamW( learning_rate=lr_scheduler, @@ -886,8 +897,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -902,25 +913,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! 
- total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. - progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 vae.eval() @@ -941,52 +946,43 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where unet_ctx_manager = unet.no_sync() else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() if use_attention_mask: - attention_mask = ( - batch["input_ids"] != tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif 
noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") if args.with_prior_preservation: # Chunk the noise and model_pred into two parts and compute the loss on each part separately. @@ -997,8 +993,7 @@ def collate_fn(examples): loss = F.mse_loss(model_pred, target, reduction="mean") # Compute prior loss - prior_loss = F.mse_loss( - model_pred_prior, target_prior, reduction="mean") + prior_loss = F.mse_loss(model_pred_prior, target_prior, reduction="mean") # Add the prior loss to the instance loss. loss = loss + args.prior_loss_weight * prior_loss @@ -1032,54 +1027,52 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") # We combine the text encoder and UNet LoRA parameters with a simple # custom logic. So, use `LoraLoaderMixin.save_lora_weights()`. LoraLoaderMixin.save_lora_weights( save_directory=save_path, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) logger.info(f"Saved lora weights to {save_path}") if global_step >= args.max_train_steps: break if is_main_process: - if (args.validation_prompt is not None and - epoch % args.validation_epochs == 0): + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}.") + f" {args.validation_prompt}." 
+ ) # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unwrap_model(unet), text_encoder=unwrap_model(text_encoder), safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) pipeline.set_progress_bar_config(disable=True) # run inference - generator = (paddle.Generator().manual_seed(args.seed) - if args.seed else None) + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ pipeline( args.validation_prompt, num_inference_steps=25, - generator=generator, ).images[0] + generator=generator, + ).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) if args.report_to == "tensorboard": - writer.add_images( - "test", np_images, epoch, dataformats="NHWC") + writer.add_images("test", np_images, epoch, dataformats="NHWC") else: - writer.add_image( - "test", np_images, epoch, dataformats="NHWC") + writer.add_image("test", np_images, epoch, dataformats="NHWC") del pipeline if args.train_text_encoder: @@ -1092,28 +1085,25 @@ def collate_fn(examples): LoraLoaderMixin.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) # Final inference # Load previous pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) # load attention processors pipeline.load_lora_weights(args.output_dir) # run inference if args.validation_prompt and args.num_validation_images > 0: - generator = paddle.Generator().manual_seed( - args.seed) if args.seed else None + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ - pipeline( - args.validation_prompt, - num_inference_steps=25, - generator=generator).images[0] + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) @@ -1128,8 +1118,7 @@ def collate_fn(examples): # logic to push to HF Hub if args.push_to_hub: if args.hub_model_id is None: - repo_id = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_id = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_id = args.hub_model_id @@ -1142,14 +1131,16 @@ def collate_fn(examples): }, base_wait_time=1.0, max_retries=5, - max_wait_time=10.0, ) + max_wait_time=10.0, + ) save_model_card( repo_id, images=images, base_model=args.pretrained_model_name_or_path, prompt=args.instance_prompt, - repo_folder=args.output_dir, ) + repo_folder=args.output_dir, + ) # Upload model logger.info(f"Pushing to {repo_id}") _retry( @@ -1164,7 +1155,8 @@ def collate_fn(examples): }, base_wait_time=1.0, max_retries=5, - max_wait_time=20.0, ) + max_wait_time=20.0, + ) if __name__ == "__main__": diff --git a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py 
b/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py index 8da60623e57c1..fb7a20763c805 100644 --- a/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py +++ b/ppdiffusers/examples/inference/dual_text_and_image_guided_generation-versatile_diffusion.py @@ -19,13 +19,9 @@ image = load_image(url) text = "a red car in the sun" -pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - "shi-labs/versatile-diffusion") +pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() text_to_image_strength = 0.75 -image = pipe( - prompt=text, image=image, - text_to_image_strength=text_to_image_strength).images[0] -image.save( - "dual_text_and_image_guided_generation-versatile_diffusion-result.png") +image = pipe(prompt=text, image=image, text_to_image_strength=text_to_image_strength).images[0] +image.save("dual_text_and_image_guided_generation-versatile_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py b/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py index 59d805fed60f1..99812e2bd2122 100644 --- a/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py +++ b/ppdiffusers/examples/inference/image_guided_image_inpainting-paint_by_example.py @@ -29,7 +29,5 @@ # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - image=init_image, mask_image=mask_image, - example_image=example_image).images[0] + image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0] image.save("image_guided_image_inpainting-paint_by_example-result.png") diff --git a/ppdiffusers/examples/inference/image_inpainting-repaint.py b/ppdiffusers/examples/inference/image_inpainting-repaint.py index 4e3cf9d1270c2..3d4a971fd734b 100644 --- a/ppdiffusers/examples/inference/image_inpainting-repaint.py +++ b/ppdiffusers/examples/inference/image_inpainting-repaint.py @@ -15,19 +15,15 @@ from ppdiffusers import RePaintPipeline, RePaintScheduler from ppdiffusers.utils import load_image -img_url = ( - "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png" -) +img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/celeba_hq_256.png" mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/mask_256.png" # Load the original image and the mask as PIL images original_image = load_image(img_url).resize((256, 256)) mask_image = load_image(mask_url).resize((256, 256)) -scheduler = RePaintScheduler.from_pretrained( - "google/ddpm-ema-celebahq-256", subfolder="scheduler") -pipe = RePaintPipeline.from_pretrained( - "google/ddpm-ema-celebahq-256", scheduler=scheduler) +scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256", subfolder="scheduler") +pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler) output = pipe( image=original_image, @@ -35,7 +31,8 @@ num_inference_steps=250, eta=0.0, jump_length=10, - jump_n_sample=10, ) + jump_n_sample=10, +) inpainted_image = output.images[0] inpainted_image.save("image_inpainting-repaint-result.png") diff --git a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py b/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py index 4889b99839ad0..ea5294247238b 100644 --- a/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py +++ 
b/ppdiffusers/examples/inference/image_mixing-stable_diffusion.py @@ -28,17 +28,16 @@ def download_image(url): # Loading additional models -feature_extractor = CLIPFeatureExtractor.from_pretrained( - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K") -clip_model = CLIPModel.from_pretrained( - "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16) +feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K") +clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", paddle_dtype=paddle.float16) mixing_pipeline = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", custom_pipeline="clip_guided_images_mixing_stable_diffusion", clip_model=clip_model, feature_extractor=feature_extractor, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) mixing_pipeline.enable_attention_slicing() # Pipline running @@ -64,6 +63,7 @@ def download_image(url): guidance_scale=9.0, batch_size=1, clip_guidance_scale=100, - generator=generator, ).images + generator=generator, +).images pipe_images[0].save("clip_guided_images_mixing_stable_diffusion.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py index 537dc6cf71437..1525fc680c2c2 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-alt_diffusion.py @@ -26,8 +26,6 @@ prompt = "奇幻的景观,以一种艺术的形式。" # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, strength=0.75, - guidance_scale=7.5).images[0] + image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] image.save("image_to_image_text_guided_generation-alt_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py index d1cf291ca57f0..b1d9267b2ac0d 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-controlnet.py @@ -19,9 +19,8 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet +) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py index 5f106c65341f3..bdd71eb35c00d 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-deepfloyd_if.py @@ -24,7 +24,8 @@ pipe = IFImg2ImgPipeline.from_pretrained( "DeepFloyd/IF-I-XL-v1.0", variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) pipe.enable_xformers_memory_efficient_attention() prompt = "A fantasy landscape in style minecraft" prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) @@ -33,25 +34,26 @@ image=original_image, prompt_embeds=prompt_embeds, 
negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images pipe.to(paddle_device="cpu") # save intermediate image pil_image = pd_to_pil(image) -pil_image[0].save( - "./image_to_image_text_guided_generation-deepfloyd_if-if_stage_I.png") +pil_image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_I.png") super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) super_res_1_pipe.enable_xformers_memory_efficient_attention() image = super_res_1_pipe( image=image, original_image=original_image, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, ).images -image[0].save( - "./image_to_image_text_guided_generation-deepfloyd_if-if_stage_II.png") + negative_prompt_embeds=negative_embeds, +).images +image[0].save("./image_to_image_text_guided_generation-deepfloyd_if-if_stage_II.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py index 8de116547d619..5b2d857d58b4a 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion.py @@ -18,8 +18,7 @@ from ppdiffusers.utils import load_image # 加载pipeline -pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") # 下载初始图片 url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" @@ -29,8 +28,6 @@ prompt = "A fantasy landscape, trending on artstation" # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, strength=0.75, - guidance_scale=7.5).images[0] + image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] image.save("image_to_image_text_guided_generation-stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py index 6103f2a54a722..67472607587b3 100644 --- a/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py +++ b/ppdiffusers/examples/inference/image_to_image_text_guided_generation-stable_diffusion_2.py @@ -17,8 +17,7 @@ from ppdiffusers import StableDiffusionImg2ImgPipeline from ppdiffusers.utils import load_image -pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2") +pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2") # 下载初始图片 url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" @@ -28,9 +27,6 @@ prompt = "A fantasy landscape, trending on artstation" # 使用fp16加快生成速度 with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, strength=0.75, - guidance_scale=7.5).images[0] + image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] -image.save( - "image_to_image_text_guided_generation-stable_diffusion_2-result.png") +image.save("image_to_image_text_guided_generation-stable_diffusion_2-result.png") diff --git 
a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py b/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py index 1f7dc26f085bc..1c7678b55930c 100644 --- a/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py +++ b/ppdiffusers/examples/inference/image_to_text_generation-unidiffuser.py @@ -16,8 +16,7 @@ from ppdiffusers.utils import load_image pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -image = load_image( - "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") +image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") result = pipe(mode="i2t", image=image, prompt=None) text = result.texts[0] with open("image_to_text_generation-unidiffuser-result.txt", "w") as f: diff --git a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py b/ppdiffusers/examples/inference/image_variation-stable_diffusion.py index 3d03fdb457501..a8478035c8c87 100644 --- a/ppdiffusers/examples/inference/image_variation-stable_diffusion.py +++ b/ppdiffusers/examples/inference/image_variation-stable_diffusion.py @@ -21,19 +21,21 @@ "lambdalabs/sd-image-variations-diffusers", revision="v2.0", from_diffusers=True, - from_hf_hub=True, ) + from_hf_hub=True, +) -im = load_image( - "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") +im = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") -tform = transforms.Compose([ - transforms.ToTensor(), - transforms.Resize( - (224, 224), - interpolation="bicubic", ), - transforms.Normalize([0.48145466, 0.4578275, 0.40821073], - [0.26862954, 0.26130258, 0.27577711]), -]) +tform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Resize( + (224, 224), + interpolation="bicubic", + ), + transforms.Normalize([0.48145466, 0.4578275, 0.40821073], [0.26862954, 0.26130258, 0.27577711]), + ] +) inp = tform(im) out = sd_pipe(im, guidance_scale=3) diff --git a/ppdiffusers/examples/inference/image_variation-unidiffuser.py b/ppdiffusers/examples/inference/image_variation-unidiffuser.py index d2bd06a9c5ec0..c334c673ff288 100644 --- a/ppdiffusers/examples/inference/image_variation-unidiffuser.py +++ b/ppdiffusers/examples/inference/image_variation-unidiffuser.py @@ -16,8 +16,7 @@ from ppdiffusers.utils import load_image pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser") -image = load_image( - "https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") +image = load_image("https://bj.bcebos.com/v1/paddlenlp/models/community/thu-ml/data/space.jpg") result = pipe(mode="i2t2i", image=image, prompt=None) image = result.images[0] image.save("image_variation-unidiffuser-result.png") diff --git a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py b/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py index 08c7fbfb6c409..3b2ec2596cbcb 100644 --- a/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py +++ b/ppdiffusers/examples/inference/image_variation-versatile_diffusion.py @@ -18,8 +18,7 @@ url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/benz.jpg" image = load_image(url) -pipe = VersatileDiffusionImageVariationPipeline.from_pretrained( - "shi-labs/versatile-diffusion") +pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") image = pipe(image).images[0] image.save("image_variation-versatile_diffusion-result.png") diff --git 
a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py b/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py index 79f9528c1a741..a986de034bc05 100644 --- a/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py +++ b/ppdiffusers/examples/inference/super_resolution-latent_diffusion.py @@ -18,8 +18,7 @@ from ppdiffusers.utils import load_image # 加载pipeline -pipe = LDMSuperResolutionPipeline.from_pretrained( - "CompVis/ldm-super-resolution-4x-openimages") +pipe = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages") # 下载初始图片 url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" diff --git a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py index b5d317b5abcce..b6b29f140e86d 100644 --- a/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py +++ b/ppdiffusers/examples/inference/text_guided_generation-semantic_stable_diffusion.py @@ -16,8 +16,7 @@ from ppdiffusers import SemanticStableDiffusionPipeline -pipe = SemanticStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = SemanticStableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "a photo of a cat" edit = { @@ -38,6 +37,7 @@ guidance_scale=guidance_scale, num_inference_steps=50, width=512, - height=512, ) + height=512, +) image = output.images[0] image.save("text_guided_generation-semantic_stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py index 013eeec9b316f..26115f88d6506 100644 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py +++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-deepfloyd_if.py @@ -14,8 +14,7 @@ import paddle -from ppdiffusers import (IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline) +from ppdiffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline from ppdiffusers.utils import load_image, pd_to_pil url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" @@ -24,8 +23,7 @@ url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" mask_image = load_image(url) -pipe = IFInpaintingPipeline.from_pretrained( - "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) +pipe = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) pipe.enable_xformers_memory_efficient_attention() prompt = "blue sunglasses" prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) @@ -35,7 +33,8 @@ mask_image=mask_image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images pipe.to(paddle_device="cpu") # save intermediate image pil_image = pd_to_pil(image) @@ -45,7 +44,8 @@ "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) super_res_1_pipe.enable_xformers_memory_efficient_attention() image = super_res_1_pipe( @@ -53,5 +53,6 @@ mask_image=mask_image, original_image=original_image, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_embeds, ).images + 
negative_prompt_embeds=negative_embeds, +).images image[0].save("./text_guided_image_inpainting-deepfloyd_if-if_stage_II.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py index dd2dde2fe504c..0fdfe1946a84f 100644 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py +++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion.py @@ -23,13 +23,10 @@ init_image = load_image(img_url).resize((512, 512)) mask_image = load_image(mask_url).resize((512, 512)) -pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") prompt = "a cat sitting on a bench" with paddle.amp.auto_cast(True): - image = pipe( - prompt=prompt, image=init_image, mask_image=mask_image, - strength=0.75).images[0] + image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images[0] image.save("text_guided_image_inpainting-stable_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py index c89ecf9f8de59..6b27f9a60cf88 100644 --- a/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py +++ b/ppdiffusers/examples/inference/text_guided_image_inpainting-stable_diffusion_2.py @@ -21,8 +21,7 @@ init_image = load_image(img_url).resize((512, 512)) mask_image = load_image(mask_url).resize((512, 512)) -pipe = StableDiffusionInpaintPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-inpainting") +pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting") prompt = "Face of a yellow cat, high resolution, sitting on a park bench" # image and mask_image should be PIL images. 
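The hunk above only shows the SD2 inpainting pipeline being constructed; as a minimal sketch of how such a pipeline is typically driven (mirroring the call pattern of the legacy inpainting script earlier in this diff, with placeholder URLs and an illustrative output filename rather than the values from the real example):

import paddle
from ppdiffusers import StableDiffusionInpaintPipeline
from ppdiffusers.utils import load_image

# Placeholder URLs -- the real script defines its own img_url / mask_url.
img_url = "https://example.com/input.png"
mask_url = "https://example.com/mask.png"

# Both inputs must be PIL images of the same size (resized to 512x512, as in the script above).
init_image = load_image(img_url).resize((512, 512))
mask_image = load_image(mask_url).resize((512, 512))

pipe = StableDiffusionInpaintPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting")
prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

# The white (masked) region is repainted according to the prompt; fp16 autocast is optional but faster.
with paddle.amp.auto_cast(True):
    image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
image.save("text_guided_image_inpainting-stable_diffusion_2-result.png")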
diff --git a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py b/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py index 736b2a2d09f37..de2298e710d3c 100644 --- a/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py +++ b/ppdiffusers/examples/inference/text_guided_image_upscaling-stable_diffusion_2.py @@ -15,8 +15,7 @@ from ppdiffusers import StableDiffusionUpscalePipeline from ppdiffusers.utils import load_image -pipe = StableDiffusionUpscalePipeline.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler") +pipe = StableDiffusionUpscalePipeline.from_pretrained("stabilityai/stable-diffusion-x4-upscaler") url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/low_res_cat.png" low_res_img = load_image(url).resize((128, 128)) diff --git a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py b/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py index 511f0f55ac93b..2b4c1b1330a97 100644 --- a/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py +++ b/ppdiffusers/examples/inference/text_to_audio_generation-audio_ldm.py @@ -18,8 +18,7 @@ from ppdiffusers import AudioLDMPipeline -pipe = AudioLDMPipeline.from_pretrained( - "cvssp/audioldm", paddle_dtype=paddle.float16) +pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm", paddle_dtype=paddle.float16) prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0] diff --git a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py index f6863fe8f4f8c..fccaff284995e 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-alt_diffusion.py @@ -14,10 +14,8 @@ from ppdiffusers import AltDiffusionPipeline, DPMSolverMultistepScheduler -scheduler = DPMSolverMultistepScheduler.from_pretrained( - "BAAI/AltDiffusion", subfolder="scheduler") -pipe = AltDiffusionPipeline.from_pretrained( - "BAAI/AltDiffusion", scheduler=scheduler) +scheduler = DPMSolverMultistepScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") +pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler) prompt = "黑暗精灵公主,非常详细,幻想,非常详细,数字绘画,概念艺术,敏锐的焦点,插图" # or in English: diff --git a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py b/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py index a016bbfbe1019..9b420b5aa57ba 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-controlnet.py @@ -21,15 +21,13 @@ controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - controlnet=controlnet, - safety_checker=None) + "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None +) resolution = 512 image = np.array( - load_image( - "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png" - )) + load_image("https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/control_bird_canny_demo.png") +) image = cv2.Canny(image, 100, 200) image = image[:, :, None] image = np.concatenate([image, image, image], axis=2) @@ -43,5 +41,6 @@ 
num_inference_steps=50, height=resolution, width=resolution, - controlnet_conditioning_scale=1.0, ).images[0] + controlnet_conditioning_scale=1.0, +).images[0] image.save("text_to_image_generation-controlnet-result-bird_canny.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py b/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py index b060557c4a7cb..f55ded139341f 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-deepfloyd_if.py @@ -14,20 +14,19 @@ import paddle -from ppdiffusers import (DiffusionPipeline, IFPipeline, - IFSuperResolutionPipeline) +from ppdiffusers import DiffusionPipeline, IFPipeline, IFSuperResolutionPipeline from ppdiffusers.utils import pd_to_pil # Stage 1: generate images -pipe = IFPipeline.from_pretrained( - "DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) +pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) pipe.enable_xformers_memory_efficient_attention() prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) image = pipe( prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images # save intermediate image pil_image = pd_to_pil(image) @@ -40,27 +39,30 @@ "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) super_res_1_pipe.enable_xformers_memory_efficient_attention() image = super_res_1_pipe( image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, - output_type="pd", ).images + output_type="pd", +).images # save intermediate image pil_image = pd_to_pil(image) -pil_image[0].save( - "text_to_image_generation-deepfloyd_if-result-if_stage_II.png") +pil_image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_II.png") # save gpu memory super_res_1_pipe.to(paddle_device="cpu") # Stage 3: super resolution stage2 super_res_2_pipe = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16) + "stabilityai/stable-diffusion-x4-upscaler", paddle_dtype=paddle.float16 +) super_res_2_pipe.enable_xformers_memory_efficient_attention() image = super_res_2_pipe( prompt=prompt, - image=image, ).images + image=image, +).images image[0].save("text_to_image_generation-deepfloyd_if-result-if_stage_III.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py index 89ebf5ee3570d..4a71ac1a6b273 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-stable_diffusion_safe.py @@ -15,10 +15,8 @@ from ppdiffusers import StableDiffusionPipelineSafe from ppdiffusers.pipelines.stable_diffusion_safe import SafetyConfig -pipe = StableDiffusionPipelineSafe.from_pretrained( - "runwayml/stable-diffusion-v1-5") +pipe = StableDiffusionPipelineSafe.from_pretrained("runwayml/stable-diffusion-v1-5") print(pipe.safety_concept) prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. 
leyendecker" out = pipe(prompt=prompt, **SafetyConfig.MAX) -out.images[0].save( - "text_to_image_generation-stable_diffusion_safe-result.png.png") +out.images[0].save("text_to_image_generation-stable_diffusion_safe-result.png.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py b/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py index 0c00344a7f602..0d0ef4e6ce819 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-t2i-adapter.py @@ -16,21 +16,20 @@ from ppdiffusers import StableDiffusionAdapterPipeline, T2IAdapter from ppdiffusers.utils import PIL_INTERPOLATION, load_image -input_image = load_image( - "https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png" -) +input_image = load_image("https://huggingface.co/RzZ/sd-v1-4-adapter-color/resolve/main/color_ref.png") color_palette = input_image.resize((8, 8)) -color_palette = color_palette.resize( - (512, 512), resample=PIL_INTERPOLATION["nearest"]) +color_palette = color_palette.resize((512, 512), resample=PIL_INTERPOLATION["nearest"]) adapter = T2IAdapter.from_pretrained("westfish/sd-v1-4-adapter-color") pipe = StableDiffusionAdapterPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", adapter=adapter, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, +) image = pipe( prompt="At night, glowing cubes in front of the beach", - image=color_palette, ).images[0] + image=color_palette, +).images[0] image.save("text_to_image_generation-t2i-adapter-result-color_adapter.png") diff --git a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py index db8d5261d101a..d777a8ce31db3 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py +++ b/ppdiffusers/examples/inference/text_to_image_generation-versatile_diffusion.py @@ -14,8 +14,7 @@ from ppdiffusers import VersatileDiffusionTextToImagePipeline -pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion") +pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() image = pipe("an astronaut riding on a horse on mars").images[0] diff --git a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py b/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py index cb4171be41abc..fd93408658d48 100644 --- a/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py +++ b/ppdiffusers/examples/inference/text_to_image_generation_mixture_tiling-stable_diffusion.py @@ -19,25 +19,30 @@ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - num_train_timesteps=1000, ) + num_train_timesteps=1000, +) pipeline = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", scheduler=scheduler, - custom_pipeline="mixture_tiling.py", ) + custom_pipeline="mixture_tiling.py", +) pipeline # Mixture of Diffusers generation image = pipeline( - prompt=[[ - "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - "An 
old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", - ]], + prompt=[ + [ + "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + ] + ], tile_height=640, tile_width=640, tile_row_overlap=0, tile_col_overlap=256, guidance_scale=8, seed=7178915308, - num_inference_steps=50, )["images"][0] + num_inference_steps=50, +)["images"][0] image.save("mixture_tiling" + ".png") diff --git a/ppdiffusers/examples/inference/text_to_video_generation-synth.py b/ppdiffusers/examples/inference/text_to_video_generation-synth.py index 9fd346c0f5bc1..e197cb41f426d 100644 --- a/ppdiffusers/examples/inference/text_to_video_generation-synth.py +++ b/ppdiffusers/examples/inference/text_to_video_generation-synth.py @@ -24,4 +24,5 @@ imageio.mimsave( "text_to_video_generation-synth-result-astronaut_riding_a_horse.mp4", video_frames, - fps=8, ) + fps=8, +) diff --git a/ppdiffusers/examples/inference/text_to_video_generation-zero.py b/ppdiffusers/examples/inference/text_to_video_generation-zero.py index b26103c3f32e2..0e4efb3563d50 100644 --- a/ppdiffusers/examples/inference/text_to_video_generation-zero.py +++ b/ppdiffusers/examples/inference/text_to_video_generation-zero.py @@ -13,14 +13,14 @@ # limitations under the License. 
import imageio + # pip install imageio[ffmpeg] import paddle from ppdiffusers import TextToVideoZeroPipeline model_id = "runwayml/stable-diffusion-v1-5" -pipe = TextToVideoZeroPipeline.from_pretrained( - model_id, paddle_dtype=paddle.float16) +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, paddle_dtype=paddle.float16) prompt = "A panda is playing guitar on times square" result = pipe(prompt=prompt).images diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py index 92557d8d6e2f4..e1914bab67daa 100644 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py +++ b/ppdiffusers/examples/inference/unconditional_audio_generation-audio_diffusion.py @@ -18,8 +18,7 @@ from ppdiffusers import AudioDiffusionPipeline # 加载模型和scheduler -pipe = AudioDiffusionPipeline.from_pretrained( - "teticio/audio-diffusion-ddim-256") +pipe = AudioDiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(42) @@ -29,8 +28,7 @@ # 保存音频到本地 for i, audio in enumerate(audio): - write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate, - audio.transpose()) + write(f"audio_diffusion_test{i}.wav", pipe.mel.sample_rate, audio.transpose()) # 保存图片 image.save("unconditional_audio_generation-audio_diffusion-result.png") diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py index 051f61f892230..9114555e75a38 100644 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py +++ b/ppdiffusers/examples/inference/unconditional_audio_generation-dance_diffusion.py @@ -27,4 +27,5 @@ write( f"unconditional_audio_generation-dance_diffusion-result_{i}.wav", pipe.unet.sample_rate, - audio.transpose(), ) + audio.transpose(), + ) diff --git a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py b/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py index d498dfbd88225..fe99d89347981 100644 --- a/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py +++ b/ppdiffusers/examples/inference/unconditional_audio_generation-spectrogram_diffusion.py @@ -22,9 +22,9 @@ # Download MIDI from: wget https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid mid_file_path = ppdiffusers_url_download( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/beethoven_hammerklavier_2.mid", - cache_dir=".", ) -pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion", paddle_dtype=paddle.float16) + cache_dir=".", +) +pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion", paddle_dtype=paddle.float16) processor = MidiProcessor() output = pipe(processor(mid_file_path)) audio = output.audios[0] diff --git a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py b/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py index 2e22c143e2271..90f93ac299ed4 100644 --- a/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py +++ b/ppdiffusers/examples/inference/unconditional_image_generation-stochastic_karras_ve.py @@ -16,8 +16,7 @@ scheduler = KarrasVeScheduler() # 
加载模型和scheduler -pipe = KarrasVePipeline.from_pretrained( - "google/ncsnpp-celebahq-256", scheduler=scheduler) +pipe = KarrasVePipeline.from_pretrained("google/ncsnpp-celebahq-256", scheduler=scheduler) # 执行pipeline进行推理 image = pipe().images diff --git a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py b/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py index fec274338d9ad..38aed057ce167 100644 --- a/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py +++ b/ppdiffusers/examples/inference/unconditional_image_text_joint_generation-unidiffuser.py @@ -19,6 +19,5 @@ image = result.images[0] image.save("unconditional_image_text_generation-unidiffuser-result.png") text = result.texts[0] -with open("unconditional_image_text_generation-unidiffuser-result.txt", - "w") as f: +with open("unconditional_image_text_generation-unidiffuser-result.txt", "w") as f: print("{}\n".format(text), file=f) diff --git a/ppdiffusers/examples/stable_diffusion/generate_images.py b/ppdiffusers/examples/stable_diffusion/generate_images.py index e20424e75e4ee..933fd0b771040 100644 --- a/ppdiffusers/examples/stable_diffusion/generate_images.py +++ b/ppdiffusers/examples/stable_diffusion/generate_images.py @@ -22,9 +22,14 @@ from paddlenlp.transformers import CLIPTextModel from tqdm.auto import tqdm -from ppdiffusers import (DDIMScheduler, EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import DOWNLOAD_SERVER, PPDIFFUSERS_CACHE base_url = DOWNLOAD_SERVER + "/CompVis/data/" @@ -43,32 +48,30 @@ def batchify(data, batch_size=16): def generate_images( - unet_model_name_or_path, - text_encoder_model_name_or_path=None, - batch_size=16, - file="coco30k.csv", - save_path="output", - seed=42, - scheduler_type="ddim", - eta=0.0, - num_inference_steps=50, - guidance_scales=[3, 4, 5, 6, 7, 8], - height=256, - width=256, - device="gpu", - variant="bf16", ): + unet_model_name_or_path, + text_encoder_model_name_or_path=None, + batch_size=16, + file="coco30k.csv", + save_path="output", + seed=42, + scheduler_type="ddim", + eta=0.0, + num_inference_steps=50, + guidance_scales=[3, 4, 5, 6, 7, 8], + height=256, + width=256, + device="gpu", + variant="bf16", +): paddle.set_device(device) if variant == "fp32": variant = None - unet = UNet2DConditionModel.from_pretrained( - unet_model_name_or_path, variant=variant) + unet = UNet2DConditionModel.from_pretrained(unet_model_name_or_path, variant=variant) kwargs = {"safety_checker": None, "unet": unet} if text_encoder_model_name_or_path is not None: - text_encoder = CLIPTextModel.from_pretrained( - text_encoder_model_name_or_path, variant=variant) + text_encoder = CLIPTextModel.from_pretrained(text_encoder_model_name_or_path, variant=variant) kwargs["text_encoder"] = text_encoder - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", **kwargs) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", **kwargs) pipe.set_progress_bar_config(disable=True) beta_start = pipe.scheduler.beta_start beta_end = pipe.scheduler.beta_end @@ -80,17 +83,14 @@ def generate_images( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - 
skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -99,7 +99,8 @@ def generate_images( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") pipe.scheduler = scheduler @@ -122,7 +123,8 @@ def generate_images( eta=eta, height=height, width=width, - num_inference_steps=num_inference_steps, )[0] + num_inference_steps=num_inference_steps, + )[0] for image in images: path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) image.save(path) @@ -136,28 +138,33 @@ def generate_images( default=None, type=str, required=True, - help="unet_model_name_or_path.", ) + help="unet_model_name_or_path.", + ) parser.add_argument( "--text_encoder_model_name_or_path", default=None, type=str, - help="text_encoder_model_name_or_path.", ) + help="text_encoder_model_name_or_path.", + ) parser.add_argument( "--file", default="coco30k", type=str, - help="eval file.", ) + help="eval file.", + ) parser.add_argument( "--variant", default="fp32", type=str, choices=["fp32", "bf16"], - help="eval file.", ) + help="eval file.", + ) parser.add_argument( "--seed", default=42, type=int, - help="random seed.", ) + help="random seed.", + ) parser.add_argument( "--scheduler_type", default="ddim", @@ -167,22 +174,15 @@ def generate_images( ) parser.add_argument("--device", default="gpu", type=str, help="device") parser.add_argument("--batch_size", default=16, type=int, help="batch_size") - parser.add_argument( - "--num_inference_steps", - default=50, - type=int, - help="num_inference_steps") - parser.add_argument( - "--save_path", - default="outputs", - type=str, - help="Path to the output file.") + parser.add_argument("--num_inference_steps", default=50, type=int, help="num_inference_steps") + parser.add_argument("--save_path", default="outputs", type=str, help="Path to the output file.") parser.add_argument( "--guidance_scales", default=[1.5, 2, 3, 4, 5, 6, 7, 8], nargs="+", type=str, - help="guidance_scales list.", ) + help="guidance_scales list.", + ) parser.add_argument("--height", default=256, type=int, help="height.") parser.add_argument("--width", default=256, type=int, help="width.") args = parser.parse_args() @@ -210,4 +210,5 @@ def generate_images( height=args.height, width=args.width, device=args.device, - variant=args.variant, ) + variant=args.variant, + ) diff --git a/ppdiffusers/examples/stable_diffusion/sd/model.py b/ppdiffusers/examples/stable_diffusion/sd/model.py index 449a74df28ff4..bd0df892a83b1 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/model.py +++ b/ppdiffusers/examples/stable_diffusion/sd/model.py @@ -21,8 +21,13 @@ from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - UNet2DConditionModel, is_ppxformers_available) +from ppdiffusers import ( + 
AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.initializer import reset_initialized_parameter, zeros_ from ppdiffusers.models.attention import AttentionBlock from ppdiffusers.models.ema import LitEma @@ -37,30 +42,31 @@ def __init__(self, model_args): self.model_args = model_args tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.tokenizer_name is not None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.tokenizer_name is not None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) vae_name_or_path = ( model_args.vae_name_or_path - if model_args.vae_name_or_path is not None else - os.path.join(model_args.pretrained_model_name_or_path, "vae")) + if model_args.vae_name_or_path is not None + else os.path.join(model_args.pretrained_model_name_or_path, "vae") + ) text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.text_encoder_name_or_path is not None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) + if model_args.text_encoder_name_or_path is not None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) unet_name_or_path = ( model_args.unet_name_or_path - if model_args.unet_name_or_path is not None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.unet_name_or_path is not None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) # init model and tokenizer tokenizer_kwargs = {} if model_args.model_max_length is not None: tokenizer_kwargs["model_max_length"] = model_args.model_max_length - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, - **tokenizer_kwargs) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **tokenizer_kwargs) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) try: self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) except Exception: @@ -88,9 +94,9 @@ def __init__(self, model_args): beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, - prediction_type=self.model_args.prediction_type, ) - self.register_buffer("alphas_cumprod", - self.noise_scheduler.alphas_cumprod) + prediction_type=self.model_args.prediction_type, + ) + self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, @@ -99,7 +105,8 @@ def __init__(self, model_args): clip_sample=False, set_alpha_to_one=False, steps_offset=1, - prediction_type=self.model_args.prediction_type, ) + prediction_type=self.model_args.prediction_type, + ) self.eval_scheduler.set_timesteps(self.model_args.num_inference_steps) self.use_ema = False self.model_ema = None @@ -109,7 +116,7 @@ def compute_snr(self, timesteps): Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 """ sqrt_alphas_cumprod = self.alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod)**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod) ** 0.5 # Expand the tensors. 
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 @@ -118,15 +125,13 @@ def compute_snr(self, timesteps): sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[ - timesteps].cast("float32") + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32") while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., - None] + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) # Compute SNR. - snr = (alpha / sigma)**2 + snr = (alpha / sigma) ** 2 return snr def forward(self, input_ids=None, pixel_values=None, **kwargs): @@ -143,14 +148,14 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): if self.model_args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += self.model_args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype + ) if self.model_args.input_perturbation: - new_noise = noise + self.model_args.input_perturbation * paddle.randn( - noise.shape, dtype=noise.dtype) + new_noise = noise + self.model_args.input_perturbation * paddle.randn(noise.shape, dtype=noise.dtype) - timesteps = paddle.randint( - 0, self.noise_scheduler.config.num_train_timesteps, - (latents.shape[0], )).cast("int64") + timesteps = paddle.randint(0, self.noise_scheduler.config.num_train_timesteps, (latents.shape[0],)).cast( + "int64" + ) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if self.model_args.input_perturbation: @@ -165,7 +170,8 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): model_pred = self.unet( sample=noisy_latents, timestep=timesteps, - encoder_hidden_states=encoder_hidden_states, ).sample + encoder_hidden_states=encoder_hidden_states, + ).sample # Get the target for loss depending on the prediction type if self.model_args.prediction_type == "epsilon": @@ -173,64 +179,58 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): elif self.model_args.prediction_type == "v_prediction": target = self.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {self.model_args.prediction_type}") + raise ValueError(f"Unknown prediction type {self.model_args.prediction_type}") # compute loss if self.model_args.snr_gamma is None: - loss = (F.mse_loss( - model_pred.cast("float32"), - target.cast("float32"), - reduction="none").mean([1, 2, 3]).mean()) + loss = ( + F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean() + ) else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. # This is discussed in Section 4.2 of the same paper. 
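# i.e. each sample's loss weight is min(snr, snr_gamma) / snr, where snr = (alpha_t / sigma_t) ** 2
# as returned by compute_snr above, so timesteps whose SNR exceeds snr_gamma are down-weighted.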
snr = self.compute_snr(timesteps) - mse_loss_weights = (paddle.stack( - [snr, self.model_args.snr_gamma * paddle.ones_like(timesteps)], - axis=1, ).min(axis=1)[0] / snr) + mse_loss_weights = ( + paddle.stack([snr, self.model_args.snr_gamma * paddle.ones_like(timesteps)], axis=1,).min( + axis=1 + )[0] + / snr + ) # We first calculate the original loss. Then we mean over the non-batch dimensions and # rebalance the sample-wise losses with their respective loss weights. # Finally, we take the mean of the rebalanced loss. - loss = F.mse_loss( - model_pred.cast("float32"), - target.cast("float32"), - reduction="none") + loss = F.mse_loss(model_pred.cast("float32"), target.cast("float32"), reduction="none") loss = loss.mean(list(range(1, len(loss.shape)))) * mse_loss_weights loss = loss.mean() return loss def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -285,20 +285,19 @@ def decode_image(self, pixel_values=None, max_batch=8, **kwargs): @paddle.no_grad() def log_image( - self, - input_ids=None, - height=256, - width=256, - eta=0.0, - guidance_scale=7.5, - max_batch=8, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=0.0, + guidance_scale=7.5, + max_batch=8, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log max_batch image if input_ids.shape[0] > max_batch: input_ids = input_ids[:max_batch] @@ -311,34 +310,25 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, - height // 8, width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clip(0, 1).transpose([0, 2, 3, 1]) * 255.0 @@ -347,8 +337,7 @@ def log_image( def set_recompute(self, use_recompute=False): if use_recompute: self.unet.enable_gradient_checkpointing() - if self.model_args.train_text_encoder and hasattr( - self.text_encoder, "gradient_checkpointing_enable"): + if self.model_args.train_text_encoder and hasattr(self.text_encoder, "gradient_checkpointing_enable"): self.text_encoder.gradient_checkpointing_enable() def gradient_checkpointing_enable(self): @@ -362,26 +351,21 @@ def set_xformers(self, use_xformers=False): ) else: try: - attention_op = os.getenv("FLAG_XFORMERS_ATTENTION_OP", - "none").lower() + attention_op = os.getenv("FLAG_XFORMERS_ATTENTION_OP", "none").lower() if attention_op == "none": attention_op = None - self.unet.enable_xformers_memory_efficient_attention( - attention_op) - if hasattr(self.vae, - "enable_xformers_memory_efficient_attention"): - self.vae.enable_xformers_memory_efficient_attention( - attention_op) - if hasattr(self.text_encoder, - "enable_xformers_memory_efficient_attention"): - self.text_encoder.enable_xformers_memory_efficient_attention( - attention_op) + self.unet.enable_xformers_memory_efficient_attention(attention_op) + if hasattr(self.vae, "enable_xformers_memory_efficient_attention"): + 
self.vae.enable_xformers_memory_efficient_attention(attention_op) + if hasattr(self.text_encoder, "enable_xformers_memory_efficient_attention"): + self.text_encoder.enable_xformers_memory_efficient_attention(attention_op) except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) def set_ema(self, use_ema=False): self.use_ema = use_ema diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py b/ppdiffusers/examples/stable_diffusion/sd/sd_args.py index d15e6b0894fe4..4ca34e749fc3f 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/sd_args.py +++ b/ppdiffusers/examples/stable_diffusion/sd/sd_args.py @@ -34,39 +34,34 @@ @dataclass class SDTrainingArguments(TrainingArguments): - image_logging_steps: int = field( - default=1000, metadata={"help": "Log image every X steps."}) - to_static: bool = field( - default=False, metadata={"help": "Whether or not to_static"}) + image_logging_steps: int = field(default=1000, metadata={"help": "Log image every X steps."}) + to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"}) benchmark: bool = field( default=False, - metadata={"help": "Whether or not run benchmark."}, ) + metadata={"help": "Whether or not run benchmark."}, + ) profiler_options: Optional[str] = field( default=None, - metadata={"help": "profiler_options."}, ) + metadata={"help": "profiler_options."}, + ) report_to: Optional[List[str]] = field( default_factory=lambda: ["custom_visualdl"], - metadata={ - "help": - "The list of integrations to report the results and logs to." - }, ) + metadata={"help": "The list of integrations to report the results and logs to."}, + ) resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) only_save_updated_model: bool = field( - default=True, - metadata={"help": "Whether or not save only_save_updated_model"}) - unet_learning_rate: float = field( - default=None, - metadata={"help": "The initial learning rate for Unet Model."}) + default=True, metadata={"help": "Whether or not save only_save_updated_model"} + ) + unet_learning_rate: float = field(default=None, metadata={"help": "The initial learning rate for Unet Model."}) text_encoder_learning_rate: float = field( default=None, metadata={"help": "The initial learning rate for Text Encoder Model."}, @@ -75,19 +70,17 @@ class SDTrainingArguments(TrainingArguments): def __post_init__(self): super().__post_init__() self.image_logging_steps = ( - (math.ceil(self.image_logging_steps / self.logging_steps) * - self.logging_steps) if self.image_logging_steps > 0 else -1) - self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", - "False")) or self.use_ema + (math.ceil(self.image_logging_steps / self.logging_steps) * self.logging_steps) + if self.image_logging_steps > 0 + else -1 + ) + self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or self.use_ema self.enable_xformers_memory_efficient_attention = ( - str2bool(os.getenv("FLAG_XFORMERS", "False")) or - self.enable_xformers_memory_efficient_attention) - self.recompute = (str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or - self.recompute) - self.benchmark = (str2bool(os.getenv("FLAG_BENCHMARK", "False")) or - self.benchmark) - self.to_static = (str2bool(os.getenv("FLAG_TO_STATIC", "False")) or - self.to_static) + str2bool(os.getenv("FLAG_XFORMERS", "False")) or self.enable_xformers_memory_efficient_attention + ) + self.recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or self.recompute + self.benchmark = str2bool(os.getenv("FLAG_BENCHMARK", "False")) or self.benchmark + self.to_static = str2bool(os.getenv("FLAG_TO_STATIC", "False")) or self.to_static if self.text_encoder_learning_rate is None: self.text_encoder_learning_rate = self.learning_rate @@ -105,45 +98,34 @@ def __post_init__(self): @dataclass class SDModelArguments: - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_name_or_path"}) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"}, + ) pretrained_model_name_or_path: str = field( default="CompVis/stable-diffusion-v1-4", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." 
- }, ) - model_max_length: int = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + model_max_length: int = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) prediction_type: str = field( default="epsilon", metadata={ - "help": - "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, ) - num_inference_steps: int = field( - default=50, metadata={"help": "num_inference_steps"}) - train_text_encoder: bool = field( - default=False, metadata={"help": "Whether or not train text encoder"}) - - noise_offset: float = field( - default=0, metadata={"help": "The scale of noise offset."}) + "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" + }, + ) + num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) + train_text_encoder: bool = field(default=False, metadata={"help": "Whether or not train text encoder"}) + + noise_offset: float = field(default=0, metadata={"help": "The scale of noise offset."}) snr_gamma: Optional[float] = field( default=None, metadata={ - "help": - "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." - }, ) + "help": "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." + }, + ) input_perturbation: Optional[float] = field( default=0, metadata={"help": "The scale of input perturbation. 
Recommended 0.1."}, @@ -158,14 +140,18 @@ class SDDataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) interpolation: str = field( default="lanczos", - metadata={"help": "interpolation method"}, ) + metadata={"help": "interpolation method"}, + ) diff --git a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py b/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py index 042f4f9410724..0ef65c15cac26 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/sd/sd_trainer.py @@ -22,7 +22,11 @@ from paddle.io import DataLoader from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, TrainerCallback, VisualDLCallback, rewrite_logs) + INTEGRATION_TO_CALLBACK, + TrainerCallback, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.transformers.model_utils import _add_variant from paddlenlp.utils import profiler from paddlenlp.utils.log import logger @@ -58,19 +62,17 @@ def autocast_smart_context_manager(self, args): custom_black_list=set(custom_black_list), custom_white_list=set(custom_white_list), level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -78,26 +80,32 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): max_batch = 4 if args.resolution > 256 else 8 image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"], max_batch=max_batch) + pixel_values=inputs["pixel_values"], max_batch=max_batch + ) image_logs["ddim-samples-1.0"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=1.0, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) image_logs["ddim-samples-7.5"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=7.5, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) if not state.is_world_process_zero: return @@ -110,10 +118,8 @@ def on_log(self, args, 
state, control, logs=None, **kwargs): logs["unet_lr"] = base_learning_rate if model.train_text_encoder: if args.text_encoder_learning_rate != args.unet_learning_rate: - logs[ - "unet_lr"] = base_learning_rate * args.unet_learning_rate - logs["text_encoder_lr"] = (base_learning_rate * - args.text_encoder_learning_rate) + logs["unet_lr"] = base_learning_rate * args.unet_learning_rate + logs["text_encoder_lr"] = base_learning_rate * args.text_encoder_learning_rate else: logs["text_encoder_lr"] = base_learning_rate @@ -127,11 +133,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -172,8 +178,7 @@ def __init__(self, benchmark=True, profiler_options=None): self.profiler_options = profiler_options def on_train_begin(self, args, state, control, **kwargs): - assert (args.gradient_accumulation_steps == 1 and not args.do_eval and - not args.do_predict) + assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict if self.benchmark: self.reader_cost_avg = AverageStatistical() @@ -198,8 +203,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, logs=None, **kwargs): if self.benchmark: if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + ( - time.time() - self.maybe_log_save_evaluate_start) + self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) ips = logs["interval_steps_per_second"] * args.train_batch_size avg_batch_cost = 1 / logs["interval_steps_per_second"] logger.info( @@ -211,14 +215,15 @@ def on_log(self, args, state, control, logs=None, **kwargs): self.reader_cost_avg.get_average(), avg_batch_cost, args.train_batch_size, - ips, )) + ips, + ) + ) self.reader_cost_avg.reset() def on_epoch_end(self, args, state, control, **kwargs): if self.benchmark: train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % - (state.epoch, train_epoch_cost)) + logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) # register visualdl_with_image @@ -232,7 +237,9 @@ def __init__(self, **kwargs): self.add_callback( BenchmarkCallback( benchmark=self.args.benchmark, - profiler_options=self.args.profiler_options, )) + profiler_options=self.args.profiler_options, + ) + ) if self.args.benchmark: if self.args.disable_tqdm: self.pop_callback(PrinterCallback) @@ -251,34 +258,27 @@ def get_train_dataloader(self): self.train_dataset, batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) else: return super().get_train_dataloader() - def _save(self, - output_dir=None, - state_dict=None, - merge_tensor_parallel=False): + def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): output_dir = output_dir if output_dir is not None else self.args.output_dir os.makedirs(output_dir, exist_ok=True) if self.args.only_save_updated_model: unwraped_model = unwrap_model(self.model) logger.info(f"Saving unet 
checkpoint to {output_dir}/unet") - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet")) + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet")) if unwraped_model.use_ema: logger.info(f"Saving ema unet checkpoint to {output_dir}/unet") with unwraped_model.ema_scope(): - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet"), variant="ema") + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"), variant="ema") if unwraped_model.train_text_encoder: - logger.info( - f"Saving text encoder checkpoint to {output_dir}/text_encoder" - ) - unwraped_model.text_encoder.save_pretrained( - os.path.join(output_dir, "text_encoder")) + logger.info(f"Saving text encoder checkpoint to {output_dir}/text_encoder") + unwraped_model.text_encoder.save_pretrained(os.path.join(output_dir, "text_encoder")) else: logger.info(f"Saving model checkpoint to {output_dir}") if state_dict is None: @@ -287,10 +287,10 @@ def _save(self, state_dict, os.path.join( output_dir, - _add_variant(PADDLE_WEIGHTS_NAME, - self.args.weight_name_suffix), ), ) + _add_variant(PADDLE_WEIGHTS_NAME, self.args.weight_name_suffix), + ), + ) if self.args.should_save: if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) - paddle.save(self.args, - os.path.join(output_dir, TRAINING_ARGS_NAME)) + paddle.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) diff --git a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py b/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py index 82d71e6c5f816..b41f0b799469f 100644 --- a/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py +++ b/ppdiffusers/examples/stable_diffusion/sd/text_image_pair_dataset.py @@ -46,8 +46,7 @@ def parse_src(filename): elif data_source == "laion_aes": text_json = json.loads(vec[2]) img_b64 = vec[5] - caption = text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) else: _, captions, _, _, _, img_b64 = vec[:6] caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") @@ -77,23 +76,26 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + ): self.size = size if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), interpolation), + RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.text_processing = lambda caption: tokenizer( @@ -101,7 +103,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids[0] + return_tensors="pd", + ).input_ids[0] self.file_list = [] file_weights = [] with open(file_list, "r") as f: @@ -122,19 +125,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = 
np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with the same probability") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -143,9 +141,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -171,19 +167,14 @@ def sample_loader(self, file_ids, filenames): if w < self.size or h < self.size: continue yield { - "pixel_values": - self.image_processing(data["image"]), - "input_ids": - self.text_processing(data["caption"]), + "pixel_values": self.image_processing(data["image"]), + "input_ids": self.text_processing(data["caption"]), } def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -192,8 +183,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py index a50d56e2b5b11..a7afb1ddf6c41 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/model.py @@ -17,8 +17,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from diffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - UNet2DConditionModel) +from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, UNet2DConditionModel from transformers import AutoTokenizer, CLIPTextModel from transformers.utils.logging import get_logger @@ -35,9 +34,8 @@ def __init__(self, model, decay=0.9999, use_num_upates=True): self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32)) self.register_buffer( "num_updates", - torch.tensor( - 0, dtype=torch.int) if use_num_upates else torch.tensor( - -1, dtype=torch.int), ) + torch.tensor(0, dtype=torch.int) if use_num_upates else torch.tensor(-1, dtype=torch.int), + ) for name, p in model.named_parameters(): if p.requires_grad: @@ -53,8 +51,7 @@ def forward(self, model): if self.num_updates >= 0: self.num_updates += 1 - decay = min(self.decay, - (1 + self.num_updates) / (10 + 
self.num_updates)) + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) one_minus_decay = 1.0 - decay @@ -65,10 +62,8 @@ def forward(self, model): for key in m_param: if m_param[key].requires_grad: sname = self.m_name2s_name[key] - shadow_params[sname] = shadow_params[sname].type_as(m_param[ - key]) - shadow_params[sname].sub_( - one_minus_decay * (shadow_params[sname] - m_param[key])) + shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) + shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) else: assert key not in self.m_name2s_name @@ -77,8 +72,7 @@ def copy_to(self, model): shadow_params = dict(self.named_buffers()) for key in m_param: if m_param[key].requires_grad: - m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]] - .data) + m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) else: assert key not in self.m_name2s_name @@ -89,9 +83,7 @@ def store(self, parameters): parameters: Iterable of `torch.nn.Parameter`; the parameters to be temporarily stored. """ - self.collected_params = [ - param.detach().cpu().clone() for param in parameters - ] + self.collected_params = [param.detach().cpu().clone() for param in parameters] def restore(self, parameters): """ @@ -113,19 +105,26 @@ class StableDiffusionModel(nn.Module): def __init__(self, model_args): super().__init__() self.model_args = model_args - tokenizer_name_or_path = (model_args.tokenizer_name - if model_args.tokenizer_name is not None else - model_args.pretrained_model_name_or_path) - vae_name_or_path = (model_args.vae_name_or_path - if model_args.vae_name_or_path is not None else - model_args.pretrained_model_name_or_path) + tokenizer_name_or_path = ( + model_args.tokenizer_name + if model_args.tokenizer_name is not None + else model_args.pretrained_model_name_or_path + ) + vae_name_or_path = ( + model_args.vae_name_or_path + if model_args.vae_name_or_path is not None + else model_args.pretrained_model_name_or_path + ) text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.text_encoder_name_or_path is not None else - model_args.pretrained_model_name_or_path) - unet_name_or_path = (model_args.unet_name_or_path - if model_args.unet_name_or_path is not None else - model_args.pretrained_model_name_or_path) + if model_args.text_encoder_name_or_path is not None + else model_args.pretrained_model_name_or_path + ) + unet_name_or_path = ( + model_args.unet_name_or_path + if model_args.unet_name_or_path is not None + else model_args.pretrained_model_name_or_path + ) # init model and tokenizer tokenizer_kwargs = {} if model_args.model_max_length is not None: @@ -134,14 +133,12 @@ def __init__(self, model_args): tokenizer_name_or_path, **tokenizer_kwargs, subfolder="tokenizer", - use_fast=False, ) - self.vae = AutoencoderKL.from_pretrained( - vae_name_or_path, subfolder="vae") - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path, subfolder="text_encoder") + use_fast=False, + ) + self.vae = AutoencoderKL.from_pretrained(vae_name_or_path, subfolder="vae") + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path, subfolder="text_encoder") try: - self.unet = UNet2DConditionModel.from_pretrained( - unet_name_or_path, subfolder="unet") + self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path, subfolder="unet") except Exception: self.unet = UNet2DConditionModel.from_config(unet_name_or_path) logger.info("Init unet model from scratch!") @@ -166,9 +163,9 @@ def 
__init__(self, model_args): beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, - prediction_type=self.model_args.prediction_type, ) - self.register_buffer("alphas_cumprod", - self.noise_scheduler.alphas_cumprod) + prediction_type=self.model_args.prediction_type, + ) + self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, @@ -177,7 +174,8 @@ def __init__(self, model_args): clip_sample=False, set_alpha_to_one=False, steps_offset=1, - prediction_type=self.model_args.prediction_type, ) + prediction_type=self.model_args.prediction_type, + ) self.eval_scheduler.set_timesteps(self.model_args.num_inference_steps) self.use_ema = False self.model_ema = None @@ -187,25 +185,22 @@ def compute_snr(self, timesteps): Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 """ sqrt_alphas_cumprod = self.alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod)**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - self.alphas_cumprod) ** 0.5 # Expand the tensors. # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 - sqrt_alphas_cumprod = sqrt_alphas_cumprod.to( - device=timesteps.device)[timesteps].float() + sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float() while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to( - device=timesteps.device)[timesteps].float() + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float() while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., - None] + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) # Compute SNR. 
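For reference, the gather-and-broadcast in `compute_snr` above boils down to a one-line closed form. A minimal standalone sketch, assuming `alphas_cumprod` is the 1-D buffer registered in `__init__` and `timesteps` is a batch of integer timesteps (the helper name `snr_at` is illustrative, not part of this patch):

import torch

def snr_at(alphas_cumprod: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
    # SNR(t) = (alpha_t / sigma_t) ** 2 = alphas_cumprod[t] / (1 - alphas_cumprod[t])
    ac = alphas_cumprod[timesteps].float()
    return ac / (1.0 - ac)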
- snr = (alpha / sigma)**2 + snr = (alpha / sigma) ** 2 return snr def forward(self, input_ids=None, pixel_values=None, **kwargs): @@ -220,17 +215,18 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): noise += self.model_args.noise_offset * torch.randn( (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype, - device=noise.device, ) + device=noise.device, + ) if self.model_args.input_perturbation: - new_noise = noise + self.model_args.input_perturbation * torch.randn_like( - noise) + new_noise = noise + self.model_args.input_perturbation * torch.randn_like(noise) timesteps = torch.randint( 0, self.noise_scheduler.config.num_train_timesteps, - (latents.shape[0], ), + (latents.shape[0],), dtype=torch.long, - device=latents.device, ) + device=latents.device, + ) # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) if self.model_args.input_perturbation: @@ -239,15 +235,15 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): noisy_latents = self.add_noise(latents, noise, timesteps) # text encode - encoder_hidden_states = self.text_encoder( - input_ids, return_dict=False)[0] + encoder_hidden_states = self.text_encoder(input_ids, return_dict=False)[0] # unet model_pred = self.unet( sample=noisy_latents, timestep=timesteps, encoder_hidden_states=encoder_hidden_states, - return_dict=False, )[0] + return_dict=False, + )[0] # Get the target for loss depending on the prediction type if self.model_args.prediction_type == "epsilon": @@ -255,62 +251,53 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): elif self.model_args.prediction_type == "v_prediction": target = self.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {self.model_args.prediction_type}") + raise ValueError(f"Unknown prediction type {self.model_args.prediction_type}") # compute loss if self.model_args.snr_gamma is None: - loss = (F.mse_loss( - model_pred.float(), target.float(), reduction="none") - .mean([1, 2, 3]).mean()) + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none").mean([1, 2, 3]).mean() else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. # This is discussed in Section 4.2 of the same paper. snr = self.compute_snr(timesteps) - mse_loss_weights = (torch.stack( - [snr, self.model_args.snr_gamma * torch.ones_like(timesteps)], - dim=1).min(dim=1)[0] / snr) + mse_loss_weights = ( + torch.stack([snr, self.model_args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr + ) # We first calculate the original loss. Then we mean over the non-batch dimensions and # rebalance the sample-wise losses with their respective loss weights. # Finally, we take the mean of the rebalanced loss. 
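The comments above describe the min-SNR rebalancing from the cited paper: each sample's MSE is scaled by min(SNR(t), snr_gamma) / SNR(t). A minimal sketch of that weighting, assuming `snr` has already been computed per sample (the function name and the example default snr_gamma=5.0 are illustrative, not part of this patch):

import torch
import torch.nn.functional as F

def min_snr_weighted_mse(model_pred, target, snr, snr_gamma=5.0):
    # Per-sample weight: min(SNR(t), snr_gamma) / SNR(t).
    weights = torch.minimum(snr, torch.full_like(snr, snr_gamma)) / snr
    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
    # Mean over the non-batch dimensions, rebalance per sample, then average the batch.
    loss = loss.mean(dim=list(range(1, loss.ndim))) * weights
    return loss.mean()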
- loss = F.mse_loss( - model_pred.float(), target.float(), reduction="none") - loss = loss.mean( - dim=list(range(1, len(loss.shape)))) * mse_loss_weights + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights loss = loss.mean() return loss def add_noise( - self, - original_samples: torch.Tensor, - noise: torch.Tensor, - timesteps: torch.Tensor, ) -> torch.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.Tensor, + ) -> torch.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: torch.Tensor, - noise: torch.Tensor, - timesteps: torch.Tensor) -> torch.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -350,20 +337,19 @@ def decode_image(self, pixel_values=None, max_batch=8, **kwargs): @torch.no_grad() def log_image( - self, - input_ids=None, - height=256, - width=256, - eta=0.0, - guidance_scale=7.5, - max_batch=8, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=0.0, + guidance_scale=7.5, + max_batch=8, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log max_batch image if input_ids.shape[0] > max_batch: input_ids = input_ids[:max_batch] @@ -376,44 +362,40 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pt", ) + return_tensors="pt", + ) uncond_embeddings = self.text_encoder( uncond_input.input_ids.to(device=input_ids.device), - return_dict=False, )[0] - text_embeddings = torch.cat( - [uncond_embeddings, text_embeddings], dim=0) - - latents = torch.randn(( - input_ids.shape[0], - self.unet.config.in_channels, - height // 8, - width // 8, )).to(device=input_ids.device) + return_dict=False, + )[0] + text_embeddings = torch.cat([uncond_embeddings, text_embeddings], dim=0) + + latents = torch.randn( + ( + input_ids.shape[0], + self.unet.config.in_channels, + height // 8, + width // 8, + ) + ).to(device=input_ids.device) latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: - latent_model_input = (torch.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = torch.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=text_embeddings, - return_dict=False, )[0] + return_dict=False, + )[0] if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - latents = self.eval_scheduler.step( - noise_pred, - t, - latents, - **extra_step_kwargs, - return_dict=False)[0] + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clamp(0, 1).permute(0, 2, 3, 1) * 255.0 @@ -422,8 +404,7 @@ def log_image( def set_recompute(self, use_recompute=False): if use_recompute: self.unet.enable_gradient_checkpointing() - if self.model_args.train_text_encoder and hasattr( - self.text_encoder, "gradient_checkpointing_enable"): + if self.model_args.train_text_encoder and hasattr(self.text_encoder, "gradient_checkpointing_enable"): self.text_encoder.gradient_checkpointing_enable() def gradient_checkpointing_enable(self): @@ -433,17 +414,15 @@ def set_xformers(self, use_xformers=False): if use_xformers: try: self.unet.enable_xformers_memory_efficient_attention() - if hasattr(self.vae, - "enable_xformers_memory_efficient_attention"): + if hasattr(self.vae, "enable_xformers_memory_efficient_attention"): self.vae.enable_xformers_memory_efficient_attention() - if hasattr(self.text_encoder, - "enable_xformers_memory_efficient_attention"): - self.text_encoder.enable_xformers_memory_efficient_attention( - ) + if hasattr(self.text_encoder, "enable_xformers_memory_efficient_attention"): + self.text_encoder.enable_xformers_memory_efficient_attention() except Exception as e: 
logger.warn( "Could not enable memory efficient attention. Make sure torch is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) def set_ema(self, use_ema=False): self.use_ema = use_ema diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py index 4efe98bed8a65..b49d994418a77 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_args.py @@ -46,63 +46,58 @@ def str2bool(v): if not str2bool(os.getenv("FLAG_SDP", "True")): if hasattr(torch.nn.functional, "scaled_dot_product_attention"): - torch.nn.functional.scaled_dot_product_attention_ = ( - torch.nn.functional.scaled_dot_product_attention) + torch.nn.functional.scaled_dot_product_attention_ = torch.nn.functional.scaled_dot_product_attention del torch.nn.functional.scaled_dot_product_attention - print( - "Removed `torch.nn.functional.scaled_dot_product_attention`, we will use default attention implementation." - ) + print("Removed `torch.nn.functional.scaled_dot_product_attention`, we will use default attention implementation.") @dataclass class SDTrainingArguments(TrainingArguments): - image_logging_steps: int = field( - default=1000, metadata={"help": "Log image every X steps."}) + image_logging_steps: int = field(default=1000, metadata={"help": "Log image every X steps."}) recompute: bool = field( default=False, - metadata={"help": "Whether or not run recompute."}, ) + metadata={"help": "Whether or not run recompute."}, + ) benchmark: bool = field( default=False, - metadata={"help": "Whether or not run benchmark."}, ) + metadata={"help": "Whether or not run benchmark."}, + ) report_to: Optional[List[str]] = field( default_factory=lambda: ["custom_visualdl"], - metadata={ - "help": - "The list of integrations to report the results and logs to." - }, ) + metadata={"help": "The list of integrations to report the results and logs to."}, + ) resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) only_save_updated_model: bool = field( - default=True, - metadata={"help": "Whether or not save only_save_updated_model"}) + default=True, metadata={"help": "Whether or not save only_save_updated_model"} + ) log_level: str = field( default="info", - metadata={"help": "log_level."}, ) + metadata={"help": "log_level."}, + ) def __post_init__(self): super().__post_init__() self.image_logging_steps = ( - (math.ceil(self.image_logging_steps / self.logging_steps) * - self.logging_steps) if self.image_logging_steps > 0 else -1) - self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", - "False")) or self.use_ema + (math.ceil(self.image_logging_steps / self.logging_steps) * self.logging_steps) + if self.image_logging_steps > 0 + else -1 + ) + self.use_ema = str2bool(os.getenv("FLAG_USE_EMA", "False")) or self.use_ema self.enable_xformers_memory_efficient_attention = ( - str2bool(os.getenv("FLAG_XFORMERS", "False")) or - self.enable_xformers_memory_efficient_attention) - self.recompute = (str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or - self.recompute) + str2bool(os.getenv("FLAG_XFORMERS", "False")) or self.enable_xformers_memory_efficient_attention + ) + self.recompute = str2bool(os.getenv("FLAG_RECOMPUTE", "False")) or self.recompute self.gradient_checkpointing = self.gradient_checkpointing or self.recompute - self.benchmark = (str2bool(os.getenv("FLAG_BENCHMARK", "False")) or - self.benchmark) + self.benchmark = str2bool(os.getenv("FLAG_BENCHMARK", "False")) or self.benchmark def print_config(self, args=None, key=""): """ @@ -115,8 +110,7 @@ def print_config(self, args=None, key=""): logger.info("{:^40}".format("{} Configuration Arguments".format(key))) logger.info("{:30}: {}".format("torch version", torch.__version__)) - logger.info("{:30}: {}".format("torch commit id", - torch.version.git_version)) + logger.info("{:30}: {}".format("torch commit id", torch.version.git_version)) for a in dir(args): if a[:2] != "__": # don't print double underscore methods @@ -129,45 +123,34 @@ def print_config(self, args=None, key=""): @dataclass class SDModelArguments: - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_name_or_path"}) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not the same as pretrained_model_name_or_path"}, + ) pretrained_model_name_or_path: str = field( default="CompVis/stable-diffusion-v1-4", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." 
- }, ) - model_max_length: int = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + model_max_length: int = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) prediction_type: str = field( default="epsilon", metadata={ - "help": - "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, ) - num_inference_steps: int = field( - default=50, metadata={"help": "num_inference_steps"}) - train_text_encoder: bool = field( - default=False, metadata={"help": "Whether or not train text encoder"}) - - noise_offset: float = field( - default=0, metadata={"help": "The scale of noise offset."}) + "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" + }, + ) + num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) + train_text_encoder: bool = field(default=False, metadata={"help": "Whether or not train text encoder"}) + + noise_offset: float = field(default=0, metadata={"help": "The scale of noise offset."}) snr_gamma: Optional[float] = field( default=None, metadata={ - "help": - "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." - }, ) + "help": "SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. More details here: https://arxiv.org/abs/2303.09556." + }, + ) input_perturbation: Optional[float] = field( default=0, metadata={"help": "The scale of input perturbation. 
Recommended 0.1."}, @@ -182,14 +165,18 @@ class SDDataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) interpolation: str = field( default="lanczos", - metadata={"help": "interpolation method"}, ) + metadata={"help": "interpolation method"}, + ) diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py index 6420971caadf8..5338a0c72d142 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/sd_trainer.py @@ -29,12 +29,13 @@ def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - logs, - **kwargs, ): + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs, + **kwargs, +): control.should_log = False return self.call_event("on_log", args, state, control, logs=logs, **kwargs) @@ -64,9 +65,7 @@ def __init__(self, vdl_writer=None): visualdl has_visualdl = False if not has_visualdl: - raise RuntimeError( - "VisualDLWithImageCallback requires visualdl to be installed. Please install visualdl." - ) + raise RuntimeError("VisualDLWithImageCallback requires visualdl to be installed. Please install visualdl.") if has_visualdl: try: from visualdl import LogWriter @@ -81,8 +80,7 @@ def __init__(self, vdl_writer=None): def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def _init_summary_writer(self, args, log_dir=None): @@ -108,34 +106,38 @@ def on_train_begin(self, args, state, control, **kwargs): self.vdl_writer.add_text("model_config", model_config_json) if hasattr(self.vdl_writer, "add_hparams"): - self.vdl_writer.add_hparams( - args.to_sanitized_dict(), metrics_list=[]) + self.vdl_writer.add_hparams(args.to_sanitized_dict(), metrics_list=[]) def on_log(self, args, state, control, logs=None, **kwargs): # log image on each node inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): max_batch = 4 if args.resolution > 256 else 8 image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"].to(args.device), - max_batch=max_batch) + pixel_values=inputs["pixel_values"].to(args.device), max_batch=max_batch + ) image_logs["ddim-samples-1.0"] = model.log_image( input_ids=inputs["input_ids"].to(args.device), guidance_scale=1.0, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) image_logs["ddim-samples-7.5"] = 
model.log_image( input_ids=inputs["input_ids"].to(args.device), guidance_scale=7.5, height=args.resolution, width=args.resolution, - max_batch=max_batch, ) + max_batch=max_batch, + ) if not state.is_world_process_zero: return @@ -153,11 +155,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() def on_train_end(self, args, state, control, **kwargs): @@ -202,8 +204,7 @@ def __init__(self, benchmark=True, **kwargs): self.benchmark = benchmark def on_train_begin(self, args, state, control, **kwargs): - assert (args.gradient_accumulation_steps == 1 and not args.do_eval and - not args.do_predict) + assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict if self.benchmark: self.reader_cost_avg = AverageStatistical() @@ -225,8 +226,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, logs=None, **kwargs): if self.benchmark: if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + ( - time.time() - self.maybe_log_save_evaluate_start) + self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) ips = logs["interval_steps_per_second"] * args.train_batch_size avg_batch_cost = 1 / logs["interval_steps_per_second"] logger.info( @@ -238,14 +238,15 @@ def on_log(self, args, state, control, logs=None, **kwargs): self.reader_cost_avg.get_average(), avg_batch_cost, args.train_batch_size, - ips, )) + ips, + ) + ) self.reader_cost_avg.reset() def on_epoch_end(self, args, state, control, **kwargs): if self.benchmark: train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % - (state.epoch, train_epoch_cost)) + logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) # register visualdl_with_image @@ -280,22 +281,22 @@ def get_train_dataloader(self): self.train_dataset, batch_size=self._train_batch_size, num_workers=self.args.dataloader_num_workers, - worker_init_fn=None - if self.args.world_size <= 1 else worker_init_fn, ) + worker_init_fn=None if self.args.world_size <= 1 else worker_init_fn, + ) else: return super().get_train_dataloader() def _inner_training_loop( - self, - batch_size=None, - args=None, - resume_from_checkpoint=None, - trial=None, - ignore_keys_for_eval=None, ): + self, + batch_size=None, + args=None, + resume_from_checkpoint=None, + trial=None, + ignore_keys_for_eval=None, + ): self.accelerator.free_memory() self._train_batch_size = batch_size - logger.debug( - f"Currently training with a batch size of: {self._train_batch_size}") + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -303,32 +304,27 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = (args.train_batch_size * - 
args.gradient_accumulation_steps * - args.world_size) + total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) - num_update_steps_per_epoch = (len_dataloader // - args.gradient_accumulation_steps) + num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) num_examples = self.num_examples(train_dataloader) if args.max_steps > 0: max_steps = args.max_steps num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( - args.max_steps % num_update_steps_per_epoch > 0) + args.max_steps % num_update_steps_per_epoch > 0 + ) # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's # the best we can do. num_train_samples = args.max_steps * total_train_batch_size else: - max_steps = math.ceil(args.num_train_epochs * - num_update_steps_per_epoch) + max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(args.num_train_epochs) - num_train_samples = (self.num_examples(train_dataloader) * - args.num_train_epochs) - elif (args.max_steps > - 0): # Rely on max_steps when dataloader does not have a working size + num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size max_steps = args.max_steps # Setting a very large number of epochs so we go as many times as necessary over the iterator. num_train_epochs = sys.maxsize @@ -338,7 +334,8 @@ def _inner_training_loop( else: raise ValueError( "args.max_steps must be set to a positive value if dataloader does not have a length, was" - f" {args.max_steps}") + f" {args.max_steps}" + ) # Compute absolute values for logging, eval, and save if given as ratio if args.logging_steps and args.logging_steps < 1: @@ -354,18 +351,20 @@ def _inner_training_loop( # references registered here no longer work on other gpus, breaking the module raise ValueError( "Currently --debug underflow_overflow is not supported under DP. Please use DDP" - " (torch.distributed.launch).") + " (torch.distributed.launch)." + ) else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa delay_optimizer_creation = ( - self.sharded_ddp is not None and - self.sharded_ddp != ShardedDDPOption.SIMPLE or - is_sagemaker_mp_enabled() or self.fsdp is not None) + self.sharded_ddp is not None + and self.sharded_ddp != ShardedDDPOption.SIMPLE + or is_sagemaker_mp_enabled() + or self.fsdp is not None + ) if self.is_deepspeed_enabled: - self.optimizer, self.lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps) + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) @@ -396,12 +395,12 @@ def _inner_training_loop( if self.use_apex: model = self.accelerator.prepare(self.model) else: - model, self.optimizer = self.accelerator.prepare( - self.model, self.optimizer) + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) else: # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. 
model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( - self.model, self.optimizer, self.lr_scheduler) + self.model, self.optimizer, self.lr_scheduler + ) if self.is_fsdp_enabled: self.model = model @@ -417,8 +416,7 @@ def _inner_training_loop( self._globalstep_last_start_time = time.time() # deepspeed ckpt loading if resume_from_checkpoint is not None and self.is_deepspeed_enabled: - deepspeed_load_checkpoint(self.model_wrapped, - resume_from_checkpoint) + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) @@ -431,19 +429,11 @@ def _inner_training_loop( logger.info("***** Running training *****") logger.info(f" Num examples = {num_examples:,}") logger.info(f" Num Epochs = {num_train_epochs:,}") - logger.info( - f" Instantaneous batch size per device = {self._train_batch_size:,}" - ) - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}" - ) + logger.info(f" Instantaneous batch size per device = {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {max_steps:,}") - logger.info( - f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}" - ) + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -453,24 +443,19 @@ def _inner_training_loop( # Check if continuing training from a checkpoint if resume_from_checkpoint is not None and os.path.isfile( - os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)): - self.state = TrainerState.load_from_json( - os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) + ): + self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) epochs_trained = self.state.global_step // num_update_steps_per_epoch if not args.ignore_data_skip: - steps_trained_in_current_epoch = self.state.global_step % ( - num_update_steps_per_epoch) + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) steps_trained_in_current_epoch *= args.gradient_accumulation_steps else: steps_trained_in_current_epoch = 0 - logger.info( - " Continuing training from checkpoint, will skip to saved global_step" - ) + logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(f" Continuing training from epoch {epochs_trained}") - logger.info( - f" Continuing training from global step {self.state.global_step}" - ) + logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: if skip_first_batches is None: logger.info( @@ -478,18 +463,16 @@ def _inner_training_loop( f" {steps_trained_in_current_epoch} batches in the first epoch. If this takes a lot of time," " you can install the latest version of Accelerate with `pip install -U accelerate`.You can" " also add the `--ignore_data_skip` flag to your launch command, but you will resume the" - " training on data already seen by your model.") + " training on data already seen by your model." 
+ ) else: logger.info( f" Will skip the first {epochs_trained} epochs then the first" f" {steps_trained_in_current_epoch} batches in the first epoch." ) - if (self.is_local_process_zero() and not args.disable_tqdm and - skip_first_batches is None): - steps_trained_progress_bar = tqdm( - total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description( - "Skipping the first batches") + if self.is_local_process_zero() and not args.disable_tqdm and skip_first_batches is None: + steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) + steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model @@ -501,9 +484,7 @@ def _inner_training_loop( # parameter to Train when using DDP. self.state.trial_name = self.hp_name(self._trial) if trial is not None: - assignments = (trial.assignments - if self.hp_search_backend == HPSearchBackend.SIGOPT - else trial) + assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) else: self.state.trial_params = None @@ -521,15 +502,14 @@ def _inner_training_loop( self._globalstep_last_logged = self.state.global_step model.zero_grad() - self.control = self.callback_handler.on_train_begin(args, self.state, - self.control) + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr( - train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler) + is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( + train_dataloader.sampler, RandomSampler + ) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. # That was before PyTorch 1.11 however... 
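As a quick sanity check of the resume bookkeeping above (a saved global_step is split into completed epochs, update steps inside the current epoch, and dataloader batches to skip), a small worked example with made-up numbers:

# Suppose 1000 optimizer steps were already taken, with 300 update steps per epoch
# and gradient_accumulation_steps = 2.
global_step = 1000
num_update_steps_per_epoch = 300
gradient_accumulation_steps = 2

epochs_trained = global_step // num_update_steps_per_epoch                      # 3 completed epochs
steps_trained_in_current_epoch = global_step % num_update_steps_per_epoch       # 100 update steps into epoch 4
batches_to_skip = steps_trained_in_current_epoch * gradient_accumulation_steps  # 200 dataloader batches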
@@ -542,17 +522,13 @@ def _inner_training_loop( total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance( - train_dataloader.sampler, DistributedSampler): + if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance( - train_dataloader.dataset, IterableDatasetShard): + elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): train_dataloader.dataset.set_epoch(epoch) if is_torch_tpu_available(): - parallel_loader = pl.ParallelLoader( - train_dataloader, - [args.device]).per_device_loader(args.device) + parallel_loader = pl.ParallelLoader(train_dataloader, [args.device]).per_device_loader(args.device) epoch_iterator = parallel_loader else: epoch_iterator = train_dataloader @@ -561,22 +537,20 @@ def _inner_training_loop( if args.past_index >= 0: self._past = None - steps_in_epoch = (len(epoch_iterator) - if len_dataloader is not None else - args.max_steps * args.gradient_accumulation_steps) - self.control = self.callback_handler.on_epoch_begin( - args, self.state, self.control) + steps_in_epoch = ( + len(epoch_iterator) + if len_dataloader is not None + else args.max_steps * args.gradient_accumulation_steps + ) + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) - if (epoch == epochs_trained and - resume_from_checkpoint is not None and - steps_trained_in_current_epoch == 0): + if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) rng_to_sync = False steps_skipped = 0 if skip_first_batches is not None and steps_trained_in_current_epoch > 0: - epoch_iterator = skip_first_batches( - epoch_iterator, steps_trained_in_current_epoch) + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) steps_skipped = steps_trained_in_current_epoch steps_trained_in_current_epoch = 0 rng_to_sync = True @@ -601,18 +575,18 @@ def _inner_training_loop( steps_trained_progress_bar = None if step % args.gradient_accumulation_steps == 0: - self.control = self.callback_handler.on_step_begin( - args, self.state, self.control) + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) with self.accelerator.accumulate(model): tr_loss_step = self.training_step(model, inputs) - if (args.logging_nan_inf_filter and - not is_torch_tpu_available() and - (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))): + if ( + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): # if loss is nan or inf simply add the average of previous logged losses - tr_loss += tr_loss / (1 + self.state.global_step - - self._globalstep_last_logged) + tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: tr_loss += tr_loss_step @@ -622,9 +596,10 @@ def _inner_training_loop( # the `or` condition of `steps_in_epoch <= args.gradient_accumulation_steps` is not covered # in accelerate if total_batched_samples % args.gradient_accumulation_steps == 0 or ( - # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps and - (step + 1) == steps_in_epoch): + # last step in epoch but step is always smaller than 
gradient_accumulation_steps + steps_in_epoch <= args.gradient_accumulation_steps + and (step + 1) == steps_in_epoch + ): # Gradient clipping if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping @@ -633,10 +608,7 @@ def _inner_training_loop( # Reduce gradients first for XLA if is_torch_tpu_available(): gradients = xm._fetch_gradients(self.optimizer) - xm.all_reduce( - "sum", - gradients, - scale=1.0 / xm.xrt_world_size()) + xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size()) # AMP: gradients need unscaling self.scaler.unscale_(self.optimizer) @@ -652,11 +624,13 @@ def _inner_training_loop( # Revert to normal clipping otherwise, handling Apex or full precision nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), - args.max_grad_norm, ) + args.max_grad_norm, + ) else: self.accelerator.clip_grad_norm_( model.parameters(), - args.max_grad_norm, ) + args.max_grad_norm, + ) # Optimizer step optimizer_was_run = True @@ -674,22 +648,20 @@ def _inner_training_loop( optimizer_was_run = scale_before <= scale_after else: self.optimizer.step() - optimizer_was_run = ( - not self.accelerator.optimizer_step_was_skipped) + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped if optimizer_was_run: # Delay optimizer scheduling until metrics are generated if not isinstance( - self.lr_scheduler, - torch.optim.lr_scheduler.ReduceLROnPlateau, ): + self.lr_scheduler, + torch.optim.lr_scheduler.ReduceLROnPlateau, + ): self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 - self.state.epoch = ( - epoch + (step + 1 + steps_skipped) / steps_in_epoch) - self.control = self.callback_handler.on_step_end( - args, self.state, self.control) + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) self._maybe_log_save_evaluate( tr_loss, @@ -697,10 +669,10 @@ def _inner_training_loop( trial, epoch, ignore_keys_for_eval, - inputs=inputs, ) + inputs=inputs, + ) else: - self.control = self.callback_handler.on_substep_end( - args, self.state, self.control) + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) if self.control.should_epoch_stop or self.control.should_training_stop: break @@ -712,15 +684,8 @@ def _inner_training_loop( ) self.control.should_training_stop = True - self.control = self.callback_handler.on_epoch_end(args, self.state, - self.control) - self._maybe_log_save_evaluate( - tr_loss, - model, - trial, - epoch, - ignore_keys_for_eval, - inputs=inputs) + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval, inputs=inputs) if DebugOption.TPU_METRICS_DEBUG in self.args.debug: if is_torch_tpu_available(): @@ -738,9 +703,7 @@ def _inner_training_loop( # Clean the state at the end of training delattr(self, "_past") - logger.info( - "\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n" - ) + logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: # Wait for everyone to get here so we are sur the model has been saved by process 0. 
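# --- Editor's note: illustrative sketch, not part of the patch above. ---
# The inner loop above only clips gradients and steps the optimizer at gradient-
# accumulation boundaries (or on the final, shorter step of an epoch). A generic PyTorch
# rendering of that pattern; the model, data, and `accum_steps` below are placeholders,
# not identifiers from the patch.
import torch
import torch.nn as nn

model = nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accum_steps, max_grad_norm = 4, 1.0
batches = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(10)]
steps_in_epoch = len(batches)

for step, (x, y) in enumerate(batches):
    loss = nn.functional.mse_loss(model(x), y) / accum_steps
    loss.backward()
    last_short_step = steps_in_epoch <= accum_steps and (step + 1) == steps_in_epoch
    if (step + 1) % accum_steps == 0 or last_short_step:
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # clip once per update
        optimizer.step()
        optimizer.zero_grad()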
if is_torch_tpu_available(): @@ -760,7 +723,8 @@ def _inner_training_loop( "train", start_time, num_samples=num_train_samples, - num_steps=self.state.max_steps, ) + num_steps=self.state.max_steps, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -772,27 +736,20 @@ def _inner_training_loop( self.log(metrics) run_dir = self._get_output_dir(trial) - checkpoints_sorted = self._sorted_checkpoints( - use_mtime=False, output_dir=run_dir) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. - if (self.args.should_save and - self.state.best_model_checkpoint is not None and - self.args.save_total_limit == 1): + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: for checkpoint in checkpoints_sorted: if checkpoint != self.state.best_model_checkpoint: - logger.info( - f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit" - ) + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") shutil.rmtree(checkpoint) - self.control = self.callback_handler.on_train_end(args, self.state, - self.control) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) return TrainOutput(self.state.global_step, train_loss, metrics) - def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, - ignore_keys_for_eval, **kwargs): + def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval, **kwargs): if self.control.should_log: if is_torch_tpu_available(): xm.mark_step() @@ -806,15 +763,15 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, tr_loss -= tr_loss logs["loss"] = round( - tr_loss_scalar / - (self.state.global_step - self._globalstep_last_logged), - 4, ) + tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), + 4, + ) logs["learning_rate"] = self._get_learning_rate() logs["global_step"] = int(self.state.global_step) - total_train_batch_size = (self.args.train_batch_size * - self.args.gradient_accumulation_steps * - self.args.world_size) + total_train_batch_size = ( + self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.world_size + ) num_steps = self.state.global_step - self._globalstep_last_logged self.store_flos() logs.update( @@ -822,7 +779,9 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, "interval", self._globalstep_last_start_time, num_samples=total_train_batch_size * num_steps, - num_steps=num_steps, )) + num_steps=num_steps, + ) + ) self._total_loss_scalar += tr_loss_scalar self._globalstep_last_logged = self.state.global_step @@ -834,20 +793,19 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, if self.control.should_evaluate: if isinstance(self.eval_dataset, dict): metrics = {} - for eval_dataset_name, eval_dataset in self.eval_dataset.items( - ): + for eval_dataset_name, eval_dataset in self.eval_dataset.items(): dataset_metrics = self.evaluate( eval_dataset=eval_dataset, ignore_keys=ignore_keys_for_eval, - metric_key_prefix=f"eval_{eval_dataset_name}", ) + metric_key_prefix=f"eval_{eval_dataset_name}", + ) metrics.update(dataset_metrics) else: metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) self._report_to_hp_search(trial, self.state.global_step, metrics) # Run delayed LR scheduler now that metrics are populated - if 
isinstance(self.lr_scheduler, - torch.optim.lr_scheduler.ReduceLROnPlateau): + if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): metric_to_check = self.args.metric_for_best_model if not metric_to_check.startswith("eval_"): metric_to_check = f"eval_{metric_to_check}" @@ -855,17 +813,15 @@ def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, if self.control.should_save: self._save_checkpoint(model, trial, metrics=metrics) - self.control = self.callback_handler.on_save(self.args, self.state, - self.control) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) def log(self, logs: Dict[str, float], **kwargs) -> None: if self.state.epoch is not None: logs["epoch"] = round(self.state.epoch, 2) - output = { ** logs, ** {"step": self.state.global_step}} + output = {**logs, **{"step": self.state.global_step}} self.state.log_history.append(output) - self.control = self.callback_handler.on_log( - self.args, self.state, self.control, logs, **kwargs) + self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs, **kwargs) def _save(self, output_dir=None, state_dict=None): output_dir = output_dir if output_dir is not None else self.args.output_dir @@ -873,34 +829,26 @@ def _save(self, output_dir=None, state_dict=None): if self.args.only_save_updated_model: unwraped_model = unwrap_model(self.model) logger.info(f"Saving unet checkpoint to {output_dir}/unet") - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet")) + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet")) if unwraped_model.use_ema: logger.info(f"Saving ema unet checkpoint to {output_dir}/unet") with unwraped_model.ema_scope(): - unwraped_model.unet.save_pretrained( - os.path.join(output_dir, "unet"), variant="ema") + unwraped_model.unet.save_pretrained(os.path.join(output_dir, "unet"), variant="ema") if unwraped_model.train_text_encoder: - logger.info( - f"Saving text encoder checkpoint to {output_dir}/text_encoder" - ) - unwraped_model.text_encoder.save_pretrained( - os.path.join(output_dir, "text_encoder")) + logger.info(f"Saving text encoder checkpoint to {output_dir}/text_encoder") + unwraped_model.text_encoder.save_pretrained(os.path.join(output_dir, "text_encoder")) else: logger.info(f"Saving model checkpoint to {output_dir}") if state_dict is None: state_dict = self.model.state_dict() - logger.info( - "Trainer.model is not a `PreTrainedModel`, only saving its state dict." 
- ) + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") if self.args.save_safetensors: import safetensors - safetensors.torch.save_file( - state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME)) + safetensors.torch.save_file(state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME)) else: torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py index 23507e6820cf0..6cbf69c57a1d4 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/sd/text_image_pair_dataset.py @@ -45,8 +45,7 @@ def parse_src(filename): elif data_source == "laion_aes": text_json = json.loads(vec[2]) img_b64 = vec[5] - caption = text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) else: _, captions, _, _, _, img_b64 = vec[:6] caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") @@ -63,24 +62,27 @@ def parse_src(filename): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + ): self.size = size assert interpolation == "lanczos" if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), InterpolationMode.LANCZOS), - transforms.RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), InterpolationMode.LANCZOS), + transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.text_processing = lambda caption: tokenizer( @@ -88,7 +90,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="pt", ).input_ids[0] + return_tensors="pt", + ).input_ids[0] self.file_list = [] file_weights = [] with open(file_list, "r") as f: @@ -109,19 +112,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -130,9 +128,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - 
filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -158,19 +154,14 @@ def sample_loader(self, file_ids, filenames): if w < self.size or h < self.size: continue yield { - "pixel_values": - self.image_processing(data["image"]), - "input_ids": - self.text_processing(data["caption"]), + "pixel_values": self.image_processing(data["image"]), + "input_ids": self.text_processing(data["caption"]), } def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -179,8 +170,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug diff --git a/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py index 0f6ad8874e14d..668ad3aae54a9 100644 --- a/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/torch_sd/train_txt2img_laion400m_trainer.py @@ -13,16 +13,20 @@ # limitations under the License. import os -import torch import transformers -from sd import (SDDataArguments, SDModelArguments, SDTrainingArguments, - StableDiffusionModel, StableDiffusionTrainer, TextImagePair) +from sd import ( + SDDataArguments, + SDModelArguments, + SDTrainingArguments, + StableDiffusionModel, + StableDiffusionTrainer, + TextImagePair, +) from transformers.trainer import get_last_checkpoint, set_seed def main(): - parser = transformers.HfArgumentParser( - (SDModelArguments, SDDataArguments, SDTrainingArguments)) + parser = transformers.HfArgumentParser((SDModelArguments, SDDataArguments, SDTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() log_level = training_args.get_process_log_level() @@ -37,16 +41,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: print( f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -65,13 +67,15 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation=data_args.interpolation, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) trainer = StableDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) checkpoint = None if training_args.resume_from_checkpoint is not None: diff --git a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py index 7e0b5e6488085..4f4cd63ceb164 100644 --- a/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py +++ b/ppdiffusers/examples/stable_diffusion/train_txt2img_laion400m_trainer.py @@ -17,13 +17,18 @@ import paddle from paddlenlp.trainer import PdArgumentParser, get_last_checkpoint, set_seed from paddlenlp.utils.log import logger -from sd import (SDDataArguments, SDModelArguments, SDTrainingArguments, - StableDiffusionModel, StableDiffusionTrainer, TextImagePair) +from sd import ( + SDDataArguments, + SDModelArguments, + SDTrainingArguments, + StableDiffusionModel, + StableDiffusionTrainer, + TextImagePair, +) def main(): - parser = PdArgumentParser( - (SDModelArguments, SDDataArguments, SDTrainingArguments)) + parser = PdArgumentParser((SDModelArguments, SDDataArguments, SDTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -32,16 +37,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
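# --- Editor's note: illustrative sketch, not part of the patch above. ---
# Both the torch and the paddle launcher guard the output directory the same way before
# training starts. A condensed version of that guard; `get_last_checkpoint` is the
# trainer helper used above (returns the newest checkpoint folder or None).
import os


def detect_resume_point(output_dir, do_train, overwrite_output_dir, get_last_checkpoint):
    if not (os.path.isdir(output_dir) and do_train and not overwrite_output_dir):
        return None
    last_checkpoint = get_last_checkpoint(output_dir)
    if last_checkpoint is None and len(os.listdir(output_dir)) > 0:
        raise ValueError(
            f"Output directory ({output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )
    return last_checkpoint  # None -> train from scratch, a path -> resume from it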
@@ -55,19 +58,16 @@ def main(): model.set_ema(training_args.use_ema) if training_args.to_static: - input_ids = paddle.static.InputSpec( - name="input_ids", - shape=[-1, model_args.model_max_length], - dtype="int64") + input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64") pixel_values = paddle.static.InputSpec( name="pixel_values", shape=[-1, 3, training_args.resolution, training_args.resolution], - dtype="float32", ) + dtype="float32", + ) specs = [input_ids, pixel_values] paddle.jit.ignore_module([os]) model = paddle.jit.to_static(model, input_spec=specs) - logger.info("Successfully to apply @to_static with specs: {}".format( - specs)) + logger.info("Successfully to apply @to_static with specs: {}".format(specs)) train_dataset = TextImagePair( file_list=data_args.file_list, @@ -76,18 +76,19 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation=data_args.interpolation, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) trainer = StableDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) if model_args.train_text_encoder: if training_args.text_encoder_learning_rate == training_args.unet_learning_rate: - params_to_train = itertools.chain(model.text_encoder.parameters(), - model.unet.parameters()) + params_to_train = itertools.chain(model.text_encoder.parameters(), model.unet.parameters()) else: # overwrite default learning rate with 1.0 training_args.learning_rate = 1.0 diff --git a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py index aee1fac6ac23b..857c78b0ae1a9 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/adapter_args.py @@ -24,60 +24,46 @@ class ModelArguments: adapter_config_file: Optional[str] = field( default="./config/openpose_adapter.json", - metadata={"help": "adapter_config_file"}, ) - vae_name_or_path: Optional[str] = field( - default=None, metadata={"help": "pretrained_vae_name_or_path"}) - text_encoder_name_or_path: Optional[str] = field( - default=None, metadata={"help": "text_encoder_name_or_path"}) - unet_name_or_path: Optional[str] = field( - default=None, metadata={"help": "unet_encoder_name_or_path"}) + metadata={"help": "adapter_config_file"}, + ) + vae_name_or_path: Optional[str] = field(default=None, metadata={"help": "pretrained_vae_name_or_path"}) + text_encoder_name_or_path: Optional[str] = field(default=None, metadata={"help": "text_encoder_name_or_path"}) + unet_name_or_path: Optional[str] = field(default=None, metadata={"help": "unet_encoder_name_or_path"}) tokenizer_name: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }, ) - model_max_length: Optional[int] = field( - default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field( - default=50, metadata={"help": "num_inference_steps"}) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, + ) + model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + num_inference_steps: Optional[int] = field(default=50, metadata={"help": 
"num_inference_steps"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) pretrained_model_name_or_path: str = field( default="runwayml/stable-diffusion-v1-5", - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) pretrained_adapter_name_or_path: str = field( default=None, metadata={ - "help": - "The pretrained weight of adapter, which is used to facilitate loading the same initialization for training." - }, ) - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init."}) - is_ldmbert: bool = field( - default=False, metadata={"help": "Whether to use ldmbert."}) + "help": "The pretrained weight of adapter, which is used to facilitate loading the same initialization for training." + }, + ) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init."}) + is_ldmbert: bool = field(default=False, metadata={"help": "Whether to use ldmbert."}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) - control_type: Optional[str] = field( - default="canny", metadata={"help": "The type of control"}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) + control_type: Optional[str] = field(default="canny", metadata={"help": "The type of control"}) latents_path: str = field( default=None, - metadata={"help": "Path to latents, used for alignment."}, ) - random_alignment: bool = field( - default=False, metadata={"help": "Whether to align random."}) + metadata={"help": "Path to latents, used for alignment."}, + ) + random_alignment: bool = field(default=False, metadata={"help": "Whether to align random."}) timestep_sample_schedule: Optional[str] = field( default="linear", metadata={ - "help": - "The type of timestep-sampling schedule during training, select from ['linear', 'cosine', 'cubic']." - }, ) + "help": "The type of timestep-sampling schedule during training, select from ['linear', 'cosine', 'cubic']." + }, + ) @dataclass @@ -88,26 +74,29 @@ class DataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) resolution: int = field( default=512, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." + }, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) data_format: str = field( default="default", metadata={ - "help": - "The data format, must be 'default' or 'img2img'. The img2img format directly provides control image." 
- }, ) + "help": "The data format, must be 'default' or 'img2img'. The img2img format directly provides control image." + }, + ) @dataclass @@ -116,45 +105,28 @@ class GenerateArguments: Arguments pertaining to specify the model generation settings. """ - use_controlnet: bool = field( - default=False, metadata={"help": "Whether or not use text condition"}) - use_dumpy_dataset: bool = field( - default=False, metadata={"help": "Whether or not use dummpy dataset"}) - adapter_model_name_or_path: str = field( - default=None, metadata={"help": "adapter model name or path."}) - sd_model_name_or_path: str = field( - default=None, metadata={"help": "sd model name or path."}) - file: str = field( - default="data/test.openpose.filelist", metadata={"help": "eval file."}) + use_controlnet: bool = field(default=False, metadata={"help": "Whether or not use text condition"}) + use_dumpy_dataset: bool = field(default=False, metadata={"help": "Whether or not use dummpy dataset"}) + adapter_model_name_or_path: str = field(default=None, metadata={"help": "adapter model name or path."}) + sd_model_name_or_path: str = field(default=None, metadata={"help": "sd model name or path."}) + file: str = field(default="data/test.openpose.filelist", metadata={"help": "eval file."}) seed: int = field(default=42, metadata={"help": "random seed."}) scheduler_type: str = field( default="ddim", - metadata={ - "help": - "Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']" - }, ) + metadata={"help": "Type of scheduler to use. Should be one of ['pndm', 'lms', 'ddim', 'euler-ancest']"}, + ) device: str = field(default="gpu", metadata={"help": "device"}) batch_size: int = field(default=16, metadata={"help": "batch_size"}) - num_inference_steps: int = field( - default=50, metadata={"help": "num_inference_steps"}) - save_path: str = field( - default="output/adapter/", - metadata={"help": "Path to the output file."}) - guidance_scales: str = field( - default_factory=lambda: [5, 7, 9], - metadata={"help": "guidance_scales list."}) + num_inference_steps: int = field(default=50, metadata={"help": "num_inference_steps"}) + save_path: str = field(default="output/adapter/", metadata={"help": "Path to the output file."}) + guidance_scales: str = field(default_factory=lambda: [5, 7, 9], metadata={"help": "guidance_scales list."}) height: int = field(default=512, metadata={"help": "height."}) width: int = field(default=512, metadata={"help": "width."}) - max_generation_limits: int = field( - default=1000, metadata={"help": "max generation limits."}) - use_text_cond: bool = field( - default=True, metadata={"help": "Whether or not use text condition"}) + max_generation_limits: int = field(default=1000, metadata={"help": "max generation limits."}) + use_text_cond: bool = field(default=True, metadata={"help": "Whether or not use text condition"}) use_default_neg_text_cond: bool = field( default=True, - metadata={ - "help": "Whether or not use default negative text condition" - }, ) - generate_data_format: str = field( - default="img2img", metadata={"help": "Generate data format."}) - generate_control_image_processor_type: str = field( - default="openpose", metadata={"help": "Generate data format."}) + metadata={"help": "Whether or not use default negative text condition"}, + ) + generate_data_format: str = field(default="img2img", metadata={"help": "Generate data format."}) + generate_control_image_processor_type: str = field(default="openpose", metadata={"help": "Generate data format."}) diff --git 
a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py index a6151bf307d1c..b7ff85077b613 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/adapter_trainer.py @@ -20,8 +20,11 @@ import paddle.amp.auto_cast as autocast from paddle.io import DataLoader from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK, - VisualDLCallback, rewrite_logs) +from paddlenlp.trainer.integrations import ( + INTEGRATION_TO_CALLBACK, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.utils.log import logger from ppdiffusers.training_utils import unwrap_model @@ -40,19 +43,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -63,20 +64,22 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"]) - image_logs["control"] = model.decode_control_image( - adapter_cond=inputs["adapter_cond"]) + image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) + image_logs["control"] = model.decode_control_image(adapter_cond=inputs["adapter_cond"]) image_logs["ddim-samples-9.0"] = model.log_image( input_ids=inputs["input_ids"], adapter_cond=inputs["adapter_cond"], guidance_scale=9.0, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) if self.vdl_writer is None: self._init_summary_writer(args) @@ -91,11 +94,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." 
+ ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -104,12 +107,9 @@ def on_log(self, args, state, control, logs=None, **kwargs): def collate_fn(examples): - pixel_values = paddle.stack( - [paddle.to_tensor(example["pixel_values"]) for example in examples]) - input_ids = paddle.stack( - [paddle.to_tensor(example["input_ids"]) for example in examples]) - adapter_cond = paddle.stack( - [paddle.to_tensor(example["adapter_cond"]) for example in examples]) + pixel_values = paddle.stack([paddle.to_tensor(example["pixel_values"]) for example in examples]) + input_ids = paddle.stack([paddle.to_tensor(example["input_ids"]) for example in examples]) + adapter_cond = paddle.stack([paddle.to_tensor(example["adapter_cond"]) for example in examples]) batch = { "input_ids": input_ids, @@ -133,18 +133,16 @@ def get_train_dataloader(self): batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, worker_init_fn=worker_init_fn, - collate_fn=collate_fn, ) + collate_fn=collate_fn, + ) else: return super().get_train_dataloader() - def _save(self, - output_dir=None, - state_dict=None, - merge_tensor_parallel=False): + def _save(self, output_dir=None, state_dict=None, merge_tensor_parallel=False): super()._save( output_dir=output_dir, state_dict=state_dict, - merge_tensor_parallel=merge_tensor_parallel, ) + merge_tensor_parallel=merge_tensor_parallel, + ) output_dir = output_dir if output_dir is not None else self.args.output_dir - unwrap_model(self.model).adapter.save_pretrained( - os.path.join(output_dir, "adapter")) + unwrap_model(self.model).adapter.save_pretrained(os.path.join(output_dir, "adapter")) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py b/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py index 5dd1dec076803..e179df14c8f40 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/data_preprocess.py @@ -45,8 +45,7 @@ def process_data(line, filename, data_format): control_image_b64str = None caption = "" - caption += text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption += text_json.get("caption_en", text_json.get("blip_caption_en", "")) if caption != "": image_base64 = image_b64str else: @@ -65,11 +64,9 @@ def parse_line(line, filename, data_format="default"): res = process_data(line, filename, data_format) if res is not None: image_base64, caption, _id, control_image_base64 = res - image = Image.open(io.BytesIO(base64.b64decode( - image_base64))).convert("RGB") + image = Image.open(io.BytesIO(base64.b64decode(image_base64))).convert("RGB") if control_image_base64 is not None: - image_extract = io.BytesIO( - base64.b64decode(control_image_base64)) + image_extract = io.BytesIO(base64.b64decode(control_image_base64)) control_image = Image.open(image_extract).convert("RGB") control_image = control_image.resize(image.size) @@ -83,7 +80,8 @@ def parse_line(line, filename, data_format="default"): (image.size[0] - image.size[1]) // 2, 0, (image.size[0] + image.size[1]) // 2, - image.size[1], ) + image.size[1], + ) image = image.crop(crop_size) if control_image is not None: control_image = control_image.crop(crop_size) @@ -95,7 +93,8 @@ def parse_line(line, filename, data_format="default"): image=image, caption=caption, _id=_id, - control_image=control_image, ) + 
control_image=control_image, + ) else: return None except Exception as e: diff --git a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py index 91969cb548b8c..74b3617fb060b 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/dumpy_dataset.py @@ -23,11 +23,12 @@ class Fill50kDataset(Dataset): def __init__( - self, - tokenizer, - file_path="./fill50k", - do_image_processing=True, - do_text_processing=True, ): + self, + tokenizer, + file_path="./fill50k", + do_image_processing=True, + do_text_processing=True, + ): self.tokenizer = tokenizer self.image_list = [] self.label_list = [] @@ -47,7 +48,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] self.do_image_processing = do_image_processing self.do_text_processing = do_text_processing @@ -67,13 +69,11 @@ def __getitem__(self, idx): if self.do_image_processing: # Normalize source images to [0, 1]. source = source.astype(np.float32) / 255.0 - source = paddle.to_tensor( - source.transpose([2, 0, 1]), dtype=paddle.float32) + source = paddle.to_tensor(source.transpose([2, 0, 1]), dtype=paddle.float32) # Normalize target images to [-1, 1]. target = (target.astype(np.float32) / 127.5) - 1.0 - target = paddle.to_tensor( - target.transpose([2, 0, 1]), dtype=paddle.float32) + target = paddle.to_tensor(target.transpose([2, 0, 1]), dtype=paddle.float32) if self.text_processing and self.do_text_processing: input_ids = self.text_processing(prompt) @@ -84,4 +84,5 @@ def __getitem__(self, idx): return dict( input_ids=input_ids, pixel_values=target, - adapter_cond=source, ) + adapter_cond=source, + ) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/model.py b/ppdiffusers/examples/t2i-adapter/adapter/model.py index 2e31f0262f56b..1c9d6f678955e 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/model.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/model.py @@ -24,9 +24,16 @@ from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LDMBertModel, T2IAdapter, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LDMBertModel, + T2IAdapter, + UNet2DConditionModel, + is_ppxformers_available, +) + # from ppdiffusers.initializer import reset_initialized_parameter from ppdiffusers.models.ema import LitEma from ppdiffusers.training_utils import freeze_params @@ -52,18 +59,20 @@ def __init__(self, model_args): # init tokenizer tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - model_max_length=model_args.model_max_length) + tokenizer_name_or_path, model_max_length=model_args.model_max_length + ) vae_name = "vqvae" if model_args.is_ldmbert else "vae" # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, vae_name)) + if model_args.pretrained_model_name_or_path is 
None + else os.path.join(model_args.pretrained_model_name_or_path, vae_name) + ) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) freeze_params(self.vae.parameters()) @@ -72,27 +81,27 @@ def __init__(self, model_args): if model_args.is_ldmbert: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "bert")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "bert") + ) # init text_encoder - self.text_encoder = LDMBertModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = LDMBertModel.from_pretrained(text_encoder_name_or_path) else: text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) freeze_params(self.text_encoder.parameters()) logger.info("Freeze text_encoder parameters!") unet_name_or_path = ( model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) self.unet = UNet2DConditionModel.from_pretrained(unet_name_or_path) @@ -100,44 +109,43 @@ def __init__(self, model_args): logger.info("Freeze unet parameters!") if model_args.pretrained_adapter_name_or_path: - self.adapter = T2IAdapter.from_pretrained( - model_args.pretrained_adapter_name_or_path) + self.adapter = T2IAdapter.from_pretrained(model_args.pretrained_adapter_name_or_path) else: - self.adapter = T2IAdapter( - **read_json(model_args.adapter_config_file)) + self.adapter = T2IAdapter(**read_json(model_args.adapter_config_file)) self.noise_scheduler = DDPMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) self.eval_scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) self.eval_scheduler.set_timesteps(model_args.num_inference_steps) self.use_ema = model_args.use_ema if self.use_ema: self.model_ema = LitEma(self.adapter) self.adapter_conditioning_scale = 1.0 - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() self.adapter.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) self.use_preconfig_latents = False if model_args.latents_path: self.use_preconfig_latents = True - self.register_buffer("preconfig_latents", - paddle.load(model_args.latents_path)) + self.register_buffer("preconfig_latents", paddle.load(model_args.latents_path)) self.random_alignment = model_args.random_alignment self.timestep_sample_schedule = model_args.timestep_sample_schedule @@ -162,36 +170,29 @@ def on_train_batch_end(self): def get_time_with_schedule(self, timestep_sample_schedule, bs): if timestep_sample_schedule == "linear": - t = paddle.randint( - low=0, - high=self.noise_scheduler.num_train_timesteps, - shape=(bs, )).astype(dtype="int64") + t = paddle.randint(low=0, high=self.noise_scheduler.num_train_timesteps, shape=(bs,)).astype(dtype="int64") elif timestep_sample_schedule == "cosine": - t = paddle.rand(shape=(bs, )) - t = paddle.cos(x=np.pi / 2.0 * - t) * self.noise_scheduler.num_train_timesteps + t = paddle.rand(shape=(bs,)) + t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps t = t.astype(dtype="int64") elif timestep_sample_schedule == "cubic": - t = paddle.rand(shape=(bs, )) + t = paddle.rand(shape=(bs,)) t = (1 - t**3) * self.noise_scheduler.num_train_timesteps t = t.astype(dtype="int64") else: raise NotImplementedError - t = paddle.clip( - x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) + t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) return t - def get_time_with_schedule_and_numpy_generator( - self, timestep_sample_schedule, bs): + def get_time_with_schedule_and_numpy_generator(self, timestep_sample_schedule, bs): if timestep_sample_schedule == "linear": t = paddle.to_tensor( - generator.randint( - 0, self.noise_scheduler.num_train_timesteps, size=(bs, )), - dtype="int64", ) + generator.randint(0, self.noise_scheduler.num_train_timesteps, size=(bs,)), + dtype="int64", + ) elif timestep_sample_schedule == "cosine": t = paddle.to_tensor(generator.rand(bs)) - t = paddle.cos(x=np.pi / 2.0 * - t) * self.noise_scheduler.num_train_timesteps + t = paddle.cos(x=np.pi / 2.0 * t) * self.noise_scheduler.num_train_timesteps t = t.astype(dtype="int64") elif timestep_sample_schedule == "cubic": t = paddle.to_tensor(generator.rand(bs)) @@ -199,18 +200,12 @@ def get_time_with_schedule_and_numpy_generator( t = t.astype(dtype="int64") else: raise NotImplementedError - t = paddle.clip( - x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) + t = paddle.clip(x=t, min=0, max=self.noise_scheduler.num_train_timesteps - 1) return t - def forward(self, - input_ids=None, - pixel_values=None, - adapter_cond=None, - **kwargs): + def forward(self, input_ids=None, pixel_values=None, adapter_cond=None, **kwargs): with paddle.no_grad(): - adapter_cond = self.control_image_processor.process_model_forward( - adapter_cond) + adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) self.train() with paddle.amp.auto_cast(enable=False): with paddle.no_grad(): @@ -220,15 +215,13 @@ def forward(self, latents = latents * 0.18215 if self.random_alignment: timesteps = self.get_time_with_schedule_and_numpy_generator( - self.timestep_sample_schedule, latents.shape[0]) - noise = paddle.to_tensor( - generator.randn(*latents.shape), dtype="float32") + self.timestep_sample_schedule, latents.shape[0] + ) + noise = paddle.to_tensor(generator.randn(*latents.shape), dtype="float32") 
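# --- Editor's note: illustrative sketch, not part of the patch above. ---
# The "linear", "cosine" and "cubic" schedules in get_time_with_schedule differ only in
# how a uniform draw is warped before scaling to [0, num_train_timesteps); the cosine and
# cubic variants concentrate draws toward the high-noise end of the range. A NumPy sketch
# of the same mapping; the generator seed and T are placeholders.
import numpy as np

rng = np.random.default_rng(0)
T = 1000  # num_train_timesteps


def sample_timesteps(schedule, batch_size):
    if schedule == "linear":
        t = rng.integers(0, T, size=batch_size)
    elif schedule == "cosine":
        t = (np.cos(np.pi / 2.0 * rng.random(batch_size)) * T).astype(np.int64)
    elif schedule == "cubic":
        t = ((1 - rng.random(batch_size) ** 3) * T).astype(np.int64)
    else:
        raise NotImplementedError(schedule)
    return np.clip(t, 0, T - 1)


print(sample_timesteps("cosine", 4))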
else: - timesteps = self.get_time_with_schedule( - self.timestep_sample_schedule, latents.shape[0]) + timesteps = self.get_time_with_schedule(self.timestep_sample_schedule, latents.shape[0]) noise = paddle.randn(latents.shape) - noisy_latents = self.noise_scheduler.add_noise(latents, noise, - timesteps) + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) encoder_hidden_states = self.text_encoder(input_ids)[0] adapter_state = self.adapter(adapter_cond) @@ -240,7 +233,8 @@ def forward(self, noisy_latents, timestep=timesteps, encoder_hidden_states=encoder_hidden_states, - down_block_additional_residuals=adapter_state, ).sample + down_block_additional_residuals=adapter_state, + ).sample loss = F.mse_loss(noise_pred, noise, reduction="mean") return loss @@ -257,29 +251,25 @@ def decode_image(self, pixel_values=None, **kwargs): @paddle.no_grad() def decode_control_image(self, adapter_cond=None, **kwargs): - adapter_cond = self.control_image_processor.process_model_forward( - adapter_cond) # (0, 1) - return (255 * (adapter_cond.transpose( - [0, 2, 3, 1])).cast("float32").numpy().round()) + adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) # (0, 1) + return 255 * (adapter_cond.transpose([0, 2, 3, 1])).cast("float32").numpy().round() @paddle.no_grad() def log_image( - self, - input_ids=None, - adapter_cond=None, - height=512, - width=512, - eta=0.0, - guidance_scale=9, - **kwargs, ): - adapter_cond = self.control_image_processor.process_model_forward( - adapter_cond) + self, + input_ids=None, + adapter_cond=None, + height=512, + width=512, + eta=0.0, + guidance_scale=9, + **kwargs, + ): + adapter_cond = self.control_image_processor.process_model_forward(adapter_cond) self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 8 image if input_ids.shape[0] > 4: input_ids = input_ids[:4] @@ -293,33 +283,28 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) if self.use_preconfig_latents: latents = self.preconfig_latents else: - latents = paddle.randn( - (input_ids.shape[0], self.unet.in_channels, height // 8, - width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) # ddim donot use this latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # Adapter predict the noise residual adapter_state = self.adapter(adapter_cond) @@ -334,19 +319,16 @@ def log_image( latent_model_input, t, encoder_hidden_states=text_embeddings, - down_block_additional_residuals=[ - state.clone() for state in adapter_state - ], ).sample + down_block_additional_residuals=[state.clone() for state in adapter_state], + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample @@ -358,7 +340,6 @@ def set_recompute(self, value=False): def fn(layer): if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.adapter.apply(fn) diff --git a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py b/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py index a523be48b4663..a3d1481c39807 100644 --- a/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py +++ b/ppdiffusers/examples/t2i-adapter/adapter/text_image_pair_dataset.py @@ -43,25 +43,28 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - 
tokenizer=None, - control_image_processor=None, - data_format="default", - do_image_processing=True, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + control_image_processor=None, + data_format="default", + do_image_processing=True, + ): self.size = size self.resize_transform = transforms.Resize(int(size), interpolation) if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.ToTensor(), # (0 ~ 1) - transforms.Normalize(0.5, 0.5), # (-1 ~ 1) - ]) + self.image_processing = transforms.Compose( + [ + transforms.ToTensor(), # (0 ~ 1) + transforms.Normalize(0.5, 0.5), # (-1 ~ 1) + ] + ) else: self.image_processing = image_processing if tokenizer is not None: @@ -70,7 +73,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] else: self.text_processing = None @@ -99,19 +103,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples self.data_format = data_format @@ -122,9 +121,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -150,31 +147,26 @@ def sample_loader(self, file_ids, filenames): control_image = data["control_image"] if control_image is not None: - control_image = self.resize_transform( - control_image) + control_image = self.resize_transform(control_image) else: control_image = image out = { - "pixel_values": - self.image_processing(image).numpy() - if self.do_image_processing else image, - "input_ids": - self.text_processing(data["caption"]) - if self.text_processing else data["caption"], - "adapter_cond": - self.control_image_processor.process_data_load( - control_image).numpy() if - self.control_image_processor else control_image, + "pixel_values": self.image_processing(image).numpy() + if self.do_image_processing + else image, + "input_ids": self.text_processing(data["caption"]) + if self.text_processing + else data["caption"], + "adapter_cond": self.control_image_processor.process_data_load(control_image).numpy() + if self.control_image_processor + else control_image, } yield out def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") 
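# --- Editor's note: illustrative sketch, not part of the patch above. ---
# random_load_from_multi_dataset (continued just below) draws a uniform number and looks
# it up in the cumulative-weight table built above to decide which file list to read
# next. A self-contained sketch of that selection rule with made-up weights and loaders.
import itertools
import random

import numpy as np

file_weights = np.array([0.7, 0.2, 0.1])                   # placeholder per-list weights
cumsum = np.concatenate([[0.0], np.cumsum(file_weights)])  # [0.0, 0.7, 0.9, 1.0]
loaders = [itertools.repeat(f"sample-from-list-{i}") for i in range(3)]  # dummy infinite loaders


def next_sample():
    r = random.random()
    for i in range(len(file_weights)):
        if cumsum[i] <= r < cumsum[i + 1]:
            break
    return next(loaders[i])


print([next_sample() for _ in range(5)])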
sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -183,8 +175,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] yield next(sample_loader) diff --git a/ppdiffusers/examples/t2i-adapter/generate.py b/ppdiffusers/examples/t2i-adapter/generate.py index b4afa6609c6eb..1197dc715e704 100644 --- a/ppdiffusers/examples/t2i-adapter/generate.py +++ b/ppdiffusers/examples/t2i-adapter/generate.py @@ -17,22 +17,28 @@ import numpy as np import paddle -from adapter import (DataArguments, Fill50kDataset, GenerateArguments, - TextImagePair) +from adapter import DataArguments, Fill50kDataset, GenerateArguments, TextImagePair from annotator.canny import CannyDetector from annotator.util import HWC3 from paddlenlp.trainer import PdArgumentParser from PIL import Image from tqdm import tqdm -from ppdiffusers import (ControlNetModel, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionAdapterPipeline, - StableDiffusionControlNetPipeline, T2IAdapter) +from ppdiffusers import ( + ControlNetModel, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionAdapterPipeline, + StableDiffusionControlNetPipeline, + T2IAdapter, +) DEFAULT_NEGATIVE_PROMPT = ( "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, " - "fewer digits, cropped, worst quality, low quality") + "fewer digits, cropped, worst quality, low quality" +) class CannyProcessor: @@ -79,31 +85,34 @@ def set_seed(seed: int): def generate_images( - use_controlnet=False, - adapter_model_name_or_path=None, - sd_model_name_or_path=None, - batch_size=16, - test_dataset=None, - save_path="output", - guidance_scales=[3, 4, 5, 6, 7, 8], - num_inference_steps=50, - scheduler_type="ddim", - device="gpu", - max_generation_limits=1000, - use_text_cond=True, - use_default_neg_text_cond=True, - generate_control_image_processor_type=None, - eta=0.0, ): + use_controlnet=False, + adapter_model_name_or_path=None, + sd_model_name_or_path=None, + batch_size=16, + test_dataset=None, + save_path="output", + guidance_scales=[3, 4, 5, 6, 7, 8], + num_inference_steps=50, + scheduler_type="ddim", + device="gpu", + max_generation_limits=1000, + use_text_cond=True, + use_default_neg_text_cond=True, + generate_control_image_processor_type=None, + eta=0.0, +): # set pipe paddle.set_device(device) if use_controlnet: controlnet = ControlNetModel.from_pretrained(adapter_model_name_or_path) pipe = StableDiffusionControlNetPipeline.from_pretrained( - sd_model_name_or_path, controlnet=controlnet, safety_checker=None) + sd_model_name_or_path, controlnet=controlnet, safety_checker=None + ) else: adapter = T2IAdapter.from_pretrained(adapter_model_name_or_path) pipe = StableDiffusionAdapterPipeline.from_pretrained( - sd_model_name_or_path, adapter=adapter, safety_checker=None) + sd_model_name_or_path, adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=True) # set scheduler @@ -117,17 +126,14 @@ def generate_images( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler 
compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -136,7 +142,8 @@ def generate_images( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") pipe.scheduler = scheduler @@ -158,24 +165,21 @@ def generate_images( write_file = open(os.path.join(save_path, "caption.txt"), "w") i = 0 for data in tqdm(test_dataset): - if (generate_control_image_processor_type == - "canny"): # Canny mode needs to manually process the control image - data["adapter_cond"] = canny_processor.process_data_load(data[ - "pixel_values"]) + if ( + generate_control_image_processor_type == "canny" + ): # Canny mode needs to manually process the control image + data["adapter_cond"] = canny_processor.process_data_load(data["pixel_values"]) images = pipe( data["input_ids"] if use_text_cond else "", - negative_prompt=DEFAULT_NEGATIVE_PROMPT - if use_default_neg_text_cond else "", + negative_prompt=DEFAULT_NEGATIVE_PROMPT if use_default_neg_text_cond else "", image=data["adapter_cond"], guidance_scale=float(cfg), eta=eta, - num_inference_steps=num_inference_steps, )[0] - data["adapter_cond"].save( - os.path.join(cond_save_path, "{:05d}_000.png".format(i))) - data["pixel_values"].save( - os.path.join(origin_save_path, "{:05d}_000.png".format(i))) - write_file.write("{:05d}_000".format(i) + "\t" + data["input_ids"] - .strip() + "\n") + num_inference_steps=num_inference_steps, + )[0] + data["adapter_cond"].save(os.path.join(cond_save_path, "{:05d}_000.png".format(i))) + data["pixel_values"].save(os.path.join(origin_save_path, "{:05d}_000.png".format(i))) + write_file.write("{:05d}_000".format(i) + "\t" + data["input_ids"].strip() + "\n") for image in images: path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) image.save(path) @@ -198,7 +202,8 @@ def generate_images( tokenizer=None, file_path=generate_args.file, do_image_processing=False, - do_text_processing=False, ) + do_text_processing=False, + ) else: test_dataset = TextImagePair( @@ -210,7 +215,8 @@ def generate_images( interpolation="lanczos", data_format=generate_args.generate_data_format, control_image_processor=None, - do_image_processing=False, ) + do_image_processing=False, + ) generate_images( use_controlnet=generate_args.use_controlnet, @@ -226,5 +232,5 @@ def generate_images( max_generation_limits=generate_args.max_generation_limits, use_text_cond=generate_args.use_text_cond, use_default_neg_text_cond=generate_args.use_default_neg_text_cond, - generate_control_image_processor_type=generate_args. 
- generate_control_image_processor_type, ) + generate_control_image_processor_type=generate_args.generate_control_image_processor_type, + ) diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py index 758e595e0ae59..01f4839ec21ff 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py +++ b/ppdiffusers/examples/t2i-adapter/tools/convert_diffusers_adapter_to_ppdiffusers.py @@ -39,8 +39,7 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -51,11 +50,11 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): "--output_path", type=str, default="paddle_models/sd-v1-4-adapter-color", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - th_controlnet = DiffusersAdapterNetModel.from_pretrained( - args.pretrained_model_name_or_path) + th_controlnet = DiffusersAdapterNetModel.from_pretrained(args.pretrained_model_name_or_path) controlnet_state_dict = convert_to_ppdiffusers(th_controlnet) pp_controlnet = PPDiffusersAdapterNetModel.from_config(th_controlnet.config) pp_controlnet.set_dict(controlnet_state_dict) diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py index 824cb9d41f945..165fb8d562914 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/t2i-adapter/tools/convert_orig_adapter_ckpt_to_ppdiffusers.py @@ -42,10 +42,7 @@ def convert_to_paddle(vae_or_unet, dtype="float32"): @patch_to(paddle.nn.Layer) -def load_state_dict(self: paddle.nn.Layer, - state_dict: dict, - use_structured_name=True, - strict=True): +def load_state_dict(self: paddle.nn.Layer, state_dict: dict, use_structured_name=True, strict=True): orig = self.state_dict() orig_keys = set([k for k in orig.keys()]) loaded_keys = set([k for k in state_dict.keys()]) @@ -76,29 +73,32 @@ def apply(name): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--orig_t2i_adapter_project_path", type=str, default="pytorch/T2I-Adapter", - help="Path to a torch model parameters file", ) + help="Path to a torch model parameters file", + ) parser.add_argument( "--orig_t2i_adapter_pretrained_ckpt_path", type=str, default="ckpt/t2iadapter_openpose_sd14v1.pth", - help="Path to a torch model parameters file", ) + help="Path to a torch model parameters file", + ) parser.add_argument( "--ppdiffusers_t2i_adapter_model_config_path", type=str, default="ppdiffusers/examples/t2i-adapter/config/openpose_adapter.json", - help="Path to a torch model parameters file", ) + help="Path to a torch model parameters file", + ) parser.add_argument( "--ppdiffusers_t2i_adapter_model_output_path", type=str, default="paddle_models/sd-v1-4-adapter-openpose_initialized", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() import os @@ 
-113,19 +113,21 @@ def apply(name): nums_rb=2, ksize=1, sk=True, - use_conv=False, ) + use_conv=False, + ) from ppdiffusers import T2IAdapter as paddle_network - Paddle_Model = paddle_network( - **read_json(args.ppdiffusers_t2i_adapter_model_config_path)) + Paddle_Model = paddle_network(**read_json(args.ppdiffusers_t2i_adapter_model_config_path)) torch_model = Torch_Model if args.orig_t2i_adapter_pretrained_ckpt_path: torch_model.load_state_dict( torch.load( args.orig_t2i_adapter_pretrained_ckpt_path, - map_location=torch.device("cpu"), ), - strict=True, ) + map_location=torch.device("cpu"), + ), + strict=True, + ) # When orig_t2i_adapter_pretrained_ckpt_path is not specified, the randomly initialized torch weights are stored in orig_t2i_adapter_pretrained_ckpt_path else: torch.save( @@ -133,7 +135,9 @@ def apply(name): os.path.join( args.orig_t2i_adapter_project_path, "ckpt", - "torch_t2i_model_initialized.pth", ), ) + "torch_t2i_model_initialized.pth", + ), + ) torch_model_dict = convert_adapter(torch_model.state_dict()) numpy_state_dict = convert_to_paddle(torch_model_dict) paddle_model = Paddle_Model diff --git a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py b/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py index dd6cc4ced4689..45f7f2262e5fd 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py +++ b/ppdiffusers/examples/t2i-adapter/tools/convert_t2i_adapter_to_latest_version.py @@ -76,13 +76,15 @@ def convert_adapter_light(old_state_dict): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--output_path", default=None, type=str, required=True, - help="Path to the store the result checkpoint.", ) + help="Path to the store the result checkpoint.", + ) parser.add_argument( "--is_adapter_light", default=False, diff --git a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py b/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py index 432265f92f6db..172b6727c299f 100644 --- a/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py +++ b/ppdiffusers/examples/t2i-adapter/tools/make_dummpy_dataset.py @@ -30,60 +30,59 @@ "--dataset_base_name", type=str, default="artv4_openpose_test13", - help="The dataset basename.", ) + help="The dataset basename.", +) parser.add_argument( "--ids_list_path", type=str, default="artv4_openpose_test13_ids.txt", - help="The ids list path.", ) + help="The ids list path.", +) parser.add_argument( "--ids_list_path", type=str, default="artv4_openpose_test13_ids.txt", - help="The ids list path.", ) + help="The ids list path.", +) parser.add_argument( "--source_prompt_list_one_path", type=str, default="prompts_artv4_openpose_test1_en_prompts.txt", - help="The first source prompt list path.", ) + help="The first source prompt list path.", +) parser.add_argument( "--source_prompt_list_two_path", type=str, default="prompts_artv4_openpose_test2_en_prompts.txt", - help="The second source prompt list path.", ) + help="The second source prompt list path.", +) parser.add_argument( "--source_prompt_list_three_path", type=str, default="prompts_artv4_openpose_test3_en_prompts.txt", - help="The third source prompt list path.", ) + help="The third source prompt list path.", +) parser.add_argument( "--dataset_prompt_json_name", type=str, default="prompt.json", - help="The dataset prompt json name.", ) + help="The dataset prompt json name.", +) args = 
parser.parse_args() -def get_images_form_urls(ids_list, - dir_path, - dataset_base_name, - type=None, - is_resize=False): +def get_images_form_urls(ids_list, dir_path, dataset_base_name, type=None, is_resize=False): for i, id in enumerate(tqdm(ids_list)): if dataset_base_name == "artv4_openpose_test13": if type == "原图": - img_url = (dataset_base_name_one_type_one_url_base + - f"{id}/{id}_final00_control.png") + img_url = dataset_base_name_one_type_one_url_base + f"{id}/{id}_final00_control.png" elif type == "Openpose控制图": - img_url = (dataset_base_name_one_type_two_url_base + - f"{id}/{id}_final00_control_openpose.png") + img_url = dataset_base_name_one_type_two_url_base + f"{id}/{id}_final00_control_openpose.png" if dataset_base_name == "artv4_openpose_test2": if type == "原图": - img_url = (dataset_base_name_two_type_one_url_base + - f"{id}/{id}_final00_control.png") + img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control.png" elif type == "Openpose控制图": - img_url = (dataset_base_name_two_type_one_url_base + - f"{id}/{id}_final00_control_openpose.png") + img_url = dataset_base_name_two_type_one_url_base + f"{id}/{id}_final00_control_openpose.png" in_image = load_image(img_url) if is_resize: in_image = in_image.resize((512, 512)) @@ -93,9 +92,7 @@ def get_images_form_urls(ids_list, def get_prompt_json_file(ids_list, prompt_lists, dataset_base_name): - with open( - os.path.join(dataset_base_name, args.dataset_prompt_json_name), - "w") as wf: + with open(os.path.join(dataset_base_name, args.dataset_prompt_json_name), "w") as wf: for i, id in enumerate(ids_list): which_prompt_list = int(id.split("_")[1][-1]) - 1 which_prompt = int(id.split("_")[-1]) @@ -112,41 +109,16 @@ def get_prompt_json_file(ids_list, prompt_lists, dataset_base_name): if __name__ == "__main__": dataset_base_name = args.dataset_base_name - ids_list = [ - line.strip() - for line in open( - args.ids_list_path, "r", encoding="utf8").readlines() - ] + ids_list = [line.strip() for line in open(args.ids_list_path, "r", encoding="utf8").readlines()] source_prompt_lists = [ - [ - line.strip() - for line in open( - args.source_prompt_list_one_path, "r", encoding="utf8") - .readlines() - ], - [ - line.strip() - for line in open( - args.source_prompt_list_two_path, "r", encoding="utf8") - .readlines() - ], - [ - line.strip() - for line in open( - args.source_prompt_list_three_path, "r", encoding="utf8") - .readlines() - ], + [line.strip() for line in open(args.source_prompt_list_one_path, "r", encoding="utf8").readlines()], + [line.strip() for line in open(args.source_prompt_list_two_path, "r", encoding="utf8").readlines()], + [line.strip() for line in open(args.source_prompt_list_three_path, "r", encoding="utf8").readlines()], ] source_dir = os.path.join(dataset_base_name, "source") target_dir = os.path.join(dataset_base_name, "target") - get_images_form_urls( - ids_list, - source_dir, - dataset_base_name, - type="Openpose控制图", - is_resize=False) - get_images_form_urls( - ids_list, target_dir, dataset_base_name, type="原图", is_resize=False) + get_images_form_urls(ids_list, source_dir, dataset_base_name, type="Openpose控制图", is_resize=False) + get_images_form_urls(ids_list, target_dir, dataset_base_name, type="原图", is_resize=False) get_prompt_json_file(ids_list, source_prompt_lists, dataset_base_name) diff --git a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py b/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py index 79180b0f624fe..7f5bb1a23ecb4 100644 --- 
a/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py +++ b/ppdiffusers/examples/t2i-adapter/train_t2i_adapter_trainer.py @@ -15,10 +15,14 @@ import os import paddle -from adapter import (AdapterLDM, AdapterLDMTrainer, DataArguments, - ModelArguments, TextImagePair) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from adapter import ( + AdapterLDM, + AdapterLDMTrainer, + DataArguments, + ModelArguments, + TextImagePair, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger @@ -28,15 +32,14 @@ def unfreeze_params(params): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) + math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps + ) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -44,16 +47,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -69,12 +70,14 @@ def main(): interpolation="lanczos", tokenizer=model.tokenizer, control_image_processor=model.control_image_processor, - data_format=data_args.data_format, ) + data_format=data_args.data_format, + ) trainer = AdapterLDMTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image.py b/ppdiffusers/examples/text_to_image/train_text_to_image.py index d9e9e7295e0d5..95328abbff75f 100644 --- a/ppdiffusers/examples/text_to_image/train_text_to_image.py +++ b/ppdiffusers/examples/text_to_image/train_text_to_image.py @@ -27,8 +27,9 @@ import paddle.nn.functional as F from datasets import DatasetDict, load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms @@ -38,19 +39,27 @@ from paddlenlp.utils.log import logger from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - UNet2DConditionModel, is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import (EMAModel, freeze_params, - main_process_first, unwrap_model) +from ppdiffusers.training_utils import ( + EMAModel, + freeze_params, + main_process_first, + unwrap_model, +) from ppdiffusers.utils import PPDIFFUSERS_CACHE, check_min_version check_min_version("0.16.1") def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) class Lambda(BaseTransform): @@ -62,11 +71,11 @@ def _apply_image(self, img): return self.fn(img) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -75,8 +84,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -84,8 +94,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion 
import ( + LDMBertModel, + ) return LDMBertModel else: @@ -101,8 +112,7 @@ def fn(layer): # unet if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) model.apply(fn) @@ -122,8 +132,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training a text to image model script.") + parser = argparse.ArgumentParser(description="Simple example of a training a text to image model script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -140,7 +149,8 @@ def parse_args(input_args=None): parser.add_argument( "--train_text_encoder", action="store_true", - help="Whether to train the text encoder.", ) + help="Whether to train the text encoder.", + ) parser.add_argument( "--dataset_name", type=str, @@ -148,7 +158,8 @@ def parse_args(input_args=None): help=( "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand."), + " or to a folder containing files that 🤗 Datasets can understand." + ), ) parser.add_argument( "--dataset_config_name", @@ -164,12 +175,14 @@ def parse_args(input_args=None): "A folder containing the training data. Folder contents must follow the structure described in" " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), ) + ), + ) parser.add_argument( "--image_column", type=str, default="image", - help="The column of the dataset containing an image.", ) + help="The column of the dataset containing an image.", + ) parser.add_argument( "--caption_column", type=str, @@ -182,7 +195,9 @@ def parse_args(input_args=None): default=None, help=( "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set."), ) + "value if set." + ), + ) parser.add_argument( "--output_dir", type=str, @@ -195,32 +210,34 @@ def parse_args(input_args=None): default=None, help="The directory where the downloaded models and datasets will be stored.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -228,16 +245,19 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. 
The images will be resized to the resolution first before cropping." - ), ) + ), + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument("--num_train_epochs", type=int, default=100) parser.add_argument( "--max_train_steps", @@ -274,18 +294,22 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--snr_gamma", type=float, default=None, help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " - "More details here: https://arxiv.org/abs/2303.09556.", ) + "More details here: https://arxiv.org/abs/2303.09556.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -296,51 +320,49 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) - parser.add_argument( - "--use_ema", action="store_true", help="Whether to use EMA model.") - parser.add_argument( - "--debug", - action="store_true", - help="Whether to debug this training script.") + help="Power factor of the polynomial scheduler.", + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.") parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -353,27 +375,28 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) else: @@ -389,9 +412,7 @@ def parse_args(input_args=None): return args -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -401,7 +422,9 @@ def get_full_repo_name(model_id: str, return f"{organization}/{model_id}" -DATASET_NAME_MAPPING = {"lambdalabs/pokemon-blip-captions": ("image", "text"), } +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} def main(): @@ -422,16 +445,13 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = 
Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -441,30 +461,26 @@ def main(): if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) freeze_params(vae.parameters()) if not args.train_text_encoder: @@ -472,7 +488,8 @@ def main(): if args.use_ema: ema_unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) ema_unet = EMAModel(ema_unet.parameters()) if args.gradient_checkpointing: @@ -480,14 +497,14 @@ def main(): if args.train_text_encoder: set_recompute(text_encoder, True) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) def compute_snr(timesteps): """ @@ -495,7 +512,7 @@ def compute_snr(timesteps): """ alphas_cumprod = noise_scheduler.alphas_cumprod sqrt_alphas_cumprod = alphas_cumprod**0.5 - sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5 # Expand the tensors. 
# Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 @@ -504,15 +521,13 @@ def compute_snr(timesteps): sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] alpha = sqrt_alphas_cumprod.expand(timesteps.shape) - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[ - timesteps].cast("float32") + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[timesteps].cast("float32") while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): - sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., - None] + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) # Compute SNR. - snr = (alpha / sigma)**2 + snr = (alpha / sigma) ** 2 return snr # Get the datasets: you can either provide your own training and evaluation files (see below) @@ -523,7 +538,8 @@ def compute_snr(timesteps): if args.debug: file_path = get_path_from_url_with_filelock( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz", - PPDIFFUSERS_CACHE, ) + PPDIFFUSERS_CACHE, + ) dataset = DatasetDict.load_from_disk(file_path) args.dataset_name = "lambdalabs/pokemon-blip-captions" else: @@ -532,7 +548,8 @@ def compute_snr(timesteps): dataset = load_dataset( args.dataset_name, args.dataset_config_name, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) else: data_files = {} if args.train_data_dir is not None: @@ -540,7 +557,8 @@ def compute_snr(timesteps): dataset = load_dataset( "imagefolder", data_files=data_files, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder @@ -551,8 +569,7 @@ def compute_snr(timesteps): # 6. Get the column names for input/target. dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) if args.image_column is None: - image_column = (dataset_columns[0] - if dataset_columns is not None else column_names[0]) + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: image_column = args.image_column if image_column not in column_names: @@ -560,8 +577,7 @@ def compute_snr(timesteps): f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" ) if args.caption_column is None: - caption_column = (dataset_columns[1] - if dataset_columns is not None else column_names[1]) + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: caption_column = args.caption_column if caption_column not in column_names: @@ -578,8 +594,7 @@ def tokenize_captions(examples, is_train=True): captions.append(caption) elif isinstance(caption, (list, np.ndarray)): # take a random caption if there are multiple - captions.append( - random.choice(caption) if is_train else caption[0]) + captions.append(random.choice(caption) if is_train else caption[0]) else: raise ValueError( f"Caption column `{caption_column}` should contain either strings or lists of strings." @@ -589,20 +604,22 @@ def tokenize_captions(examples, is_train=True): max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True, - return_attention_mask=False, ) + return_attention_mask=False, + ) return inputs.input_ids # Preprocessing the datasets. 
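Editor's note on compute_snr above: it derives the per-timestep signal-to-noise ratio from the scheduler's alphas_cumprod as SNR(t) = alpha_bar_t / (1 - alpha_bar_t). A standalone numeric sketch (the alphas_cumprod values below are made up for illustration, not taken from any real scheduler):

import numpy as np

# Toy alphas_cumprod for 4 timesteps; real values come from the DDPM scheduler.
alphas_cumprod = np.array([0.999, 0.98, 0.7, 0.1])
timesteps = np.array([1, 3])

alpha = np.sqrt(alphas_cumprod[timesteps])        # sqrt(alpha_bar_t)
sigma = np.sqrt(1.0 - alphas_cumprod[timesteps])  # sqrt(1 - alpha_bar_t)
snr = (alpha / sigma) ** 2                        # alpha_bar_t / (1 - alpha_bar_t)
print(snr)  # large for early (low-noise) timesteps, small for late (noisy) ones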
- train_transforms = transforms.Compose([ - transforms.Resize( - (args.height, args.width), interpolation="bilinear"), - transforms.CenterCrop((args.height, args.width)) if args.center_crop - else transforms.RandomCrop((args.height, args.width)), - transforms.RandomHorizontalFlip() - if args.random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + train_transforms = transforms.Compose( + [ + transforms.Resize((args.height, args.width), interpolation="bilinear"), + transforms.CenterCrop((args.height, args.width)) + if args.center_crop + else transforms.RandomCrop((args.height, args.width)), + transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def preprocess_train(examples): images = [image.convert("RGB") for image in examples[image_column]] @@ -612,47 +629,42 @@ def preprocess_train(examples): with main_process_first(): if args.max_train_samples is not None: - dataset["train"] = (dataset["train"].shuffle(seed=args.seed) - .select(range(args.max_train_samples))) + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) # Set the training transforms train_dataset = dataset["train"].with_transform(preprocess_train) def collate_fn(examples): - pixel_values = paddle.stack( - [example["pixel_values"] for example in examples]).cast("float32") + pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32") input_ids = [example["input_ids"] for example in examples] input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. 
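Editor's note on the padding strategy used in this hunk: tokenize_captions tokenizes with padding="do_not_pad", and collate_fn only pads the batch up to the tokenizer's model_max_length when the batch is assembled. A tiny sketch of that deferred-padding pattern with hypothetical ids (the token values are illustrative, not real CLIP vocabulary guarantees):

# Variable-length ids as tokenize_captions would return them (no padding yet).
batch_input_ids = [[101, 7, 9, 102], [101, 7, 102]]
model_max_length = 8
pad_token_id = 0  # placeholder pad id for illustration

# What tokenizer.pad(..., padding="max_length") conceptually does at collate time.
padded = [ids + [pad_token_id] * (model_max_length - len(ids)) for ids in batch_input_ids]
print(padded)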
- num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -660,23 +672,22 @@ def collate_fn(examples): text_encoder = paddle.DataParallel(text_encoder) params_to_optimize = ( - list(unet.parameters()) + list(text_encoder.parameters()) - if args.train_text_encoder else unet.parameters()) + list(unet.parameters()) + list(text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) # Initialize the optimizer optimizer = AdamW( learning_rate=lr_scheduler, @@ -685,8 +696,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if is_main_process: logger.info("----------- Configuration Arguments -----------") @@ -696,25 +707,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
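Editor's note, a worked example of the step bookkeeping above (all numbers invented for illustration): with 1,000 batches per epoch, gradient_accumulation_steps=4, 2 processes, and a per-device batch size of 16, each epoch produces ceil(1000/4) = 250 optimizer updates and the effective batch size is 16 * 2 * 4 = 128.

import math

# Illustrative values only.
len_train_dataloader = 1000
gradient_accumulation_steps = 4
num_train_epochs = 100
train_batch_size = 16
num_processes = 2

num_update_steps_per_epoch = math.ceil(len_train_dataloader / gradient_accumulation_steps)  # 250
max_train_steps = num_train_epochs * num_update_steps_per_epoch                              # 25000
total_batch_size = train_batch_size * num_processes * gradient_accumulation_steps            # 128
print(num_update_steps_per_epoch, max_train_steps, total_batch_size)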
- progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 @@ -737,20 +742,19 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where @@ -758,68 +762,61 @@ def collate_fn(examples): if args.train_text_encoder: text_encoder_ctx_manager = text_encoder.no_sync() else: - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) - else contextlib.suppress()) + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) with text_encoder_ctx_manager: # Get the text embedding for conditioning if use_attention_mask: - attention_mask = (batch["input_ids"] != - tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") if args.snr_gamma is None: loss = F.mse_loss( model_pred.cast("float32"), 
target.cast("float32"), - reduction="mean", ) + reduction="mean", + ) else: # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. # Since we predict the noise instead of x_0, the original formulation is slightly changed. # This is discussed in Section 4.2 of the same paper. snr = compute_snr(timesteps) - mse_loss_weights = (paddle.stack( - [ - snr, - args.snr_gamma * paddle.ones_like(timesteps) - ], - axis=1, ).min(1)[0] / snr) + mse_loss_weights = ( + paddle.stack([snr, args.snr_gamma * paddle.ones_like(timesteps)], axis=1,).min( + 1 + )[0] + / snr + ) # We first calculate the original loss. Then we mean over the non-batch dimensions and # rebalance the sample-wise losses with their respective loss weights. # Finally, we take the mean of the rebalanced loss. loss = F.mse_loss( model_pred.cast("float32"), target.cast("float32"), - reduction="none", ) - loss = (loss.mean(axis=list(range(1, len(loss.shape)))) - * mse_loss_weights) + reduction="none", + ) + loss = loss.mean(axis=list(range(1, len(loss.shape)))) * mse_loss_weights loss = loss.mean() if args.gradient_accumulation_steps > 1: @@ -851,13 +848,10 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") - unwrap_model(unet).save_pretrained( - os.path.join(save_path, "unet")) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + unwrap_model(unet).save_pretrained(os.path.join(save_path, "unet")) if args.train_text_encoder: - unwrap_model(text_encoder).save_pretrained( - os.path.join(save_path, "text_encoder")) + unwrap_model(text_encoder).save_pretrained(os.path.join(save_path, "text_encoder")) if global_step >= args.max_train_steps: break @@ -871,14 +865,12 @@ def collate_fn(examples): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unet, - text_encoder=unwrap_model(text_encoder), ) + text_encoder=unwrap_model(text_encoder), + ) pipeline.save_pretrained(args.output_dir) if args.push_to_hub: - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) if __name__ == "__main__": diff --git a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py b/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py index b07bc09c1d1ae..611aebd6a5dc0 100644 --- a/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py +++ b/ppdiffusers/examples/text_to_image/train_text_to_image_lora.py @@ -29,8 +29,9 @@ import paddle.nn.functional as F from datasets import DatasetDict, load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision import BaseTransform, transforms @@ -40,31 +41,37 @@ from paddlenlp.utils.log import logger from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, + 
is_ppxformers_available, +) from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin from ppdiffusers.models.attention_processor import ( - AttnProcessor, AttnProcessor2_5, LoRAAttnProcessor, LoRAAttnProcessor2_5) + AttnProcessor, + AttnProcessor2_5, + LoRAAttnProcessor, + LoRAAttnProcessor2_5, +) from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import (freeze_params, main_process_first, - unwrap_model) -from ppdiffusers.utils import (PPDIFFUSERS_CACHE, TEXT_ENCODER_ATTN_MODULE, - check_min_version) +from ppdiffusers.training_utils import freeze_params, main_process_first, unwrap_model +from ppdiffusers.utils import ( + PPDIFFUSERS_CACHE, + TEXT_ENCODER_ATTN_MODULE, + check_min_version, +) check_min_version("0.16.1") def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) -def save_model_card(repo_id: str, - images=None, - base_model=str, - dataset_name=str, - repo_folder=None): +def save_model_card(repo_id: str, images=None, base_model=str, dataset_name=str, repo_folder=None): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -94,11 +101,11 @@ def save_model_card(repo_id: str, f.write(yaml + model_card) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -107,8 +114,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -116,8 +124,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -148,8 +157,7 @@ def get_report_to(args): def parse_args(input_args=None): - parser = argparse.ArgumentParser( - description="Simple example of a training text to image lora script.") + parser = argparse.ArgumentParser(description="Simple example of a training text to image lora script.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -170,7 +178,8 @@ def parse_args(input_args=None): help=( "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," - " or to a folder containing files that 🤗 Datasets can understand."), + " or to a folder containing files that 🤗 Datasets can understand." 
+ ), ) parser.add_argument( "--dataset_config_name", @@ -186,12 +195,14 @@ def parse_args(input_args=None): "A folder containing the training data. Folder contents must follow the structure described in" " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), ) + ), + ) parser.add_argument( "--image_column", type=str, default="image", - help="The column of the dataset containing an image.", ) + help="The column of the dataset containing an image.", + ) parser.add_argument( "--caption_column", type=str, @@ -202,7 +213,8 @@ def parse_args(input_args=None): "--validation_prompt", type=str, default=None, - help="A prompt that is sampled during training for inference.", ) + help="A prompt that is sampled during training for inference.", + ) parser.add_argument( "--num_validation_images", type=int, @@ -216,14 +228,17 @@ def parse_args(input_args=None): help=( "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`." - ), ) + ), + ) parser.add_argument( "--max_train_samples", type=int, default=None, help=( "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set."), ) + "value if set." + ), + ) parser.add_argument( "--output_dir", type=str, @@ -236,32 +251,34 @@ def parse_args(input_args=None): default=None, help="The directory where the downloaded models and datasets will be stored.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -269,21 +286,25 @@ def parse_args(input_args=None): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." 
- ), ) + ), + ) parser.add_argument( "--lora_rank", type=int, default=4, - help="The rank of lora linear.", ) + help="The rank of lora linear.", + ) parser.add_argument( "--random_flip", action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--train_text_encoder", action="store_true", @@ -300,7 +321,8 @@ def parse_args(input_args=None): "--checkpointing_steps", type=int, default=500, - help=("Save a checkpoint of the training state every X updates."), ) + help=("Save a checkpoint of the training state every X updates."), + ) parser.add_argument( "--gradient_accumulation_steps", type=int, @@ -330,12 +352,15 @@ def parse_args(input_args=None): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -346,49 +371,48 @@ def parse_args(input_args=None): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) - parser.add_argument( - "--debug", - action="store_true", - help="Whether to debug this training script.") + help="Power factor of the polynomial scheduler.", + ) + parser.add_argument("--debug", action="store_true", help="Whether to debug this training script.") parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), ) + ), + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -401,22 +425,22 @@ def parse_args(input_args=None): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", choices=["tensorboard", "visualdl"], - help="Log writer type.", ) + help="Log writer type.", + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") if input_args is not None: args = parser.parse_args(input_args) else: @@ -432,9 +456,7 @@ def parse_args(input_args=None): return args -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -444,7 +466,9 @@ def get_full_repo_name(model_id: str, return f"{organization}/{model_id}" -DATASET_NAME_MAPPING = {"lambdalabs/pokemon-blip-captions": ("image", "text"), } +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} def main(): @@ -465,16 +489,13 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") 
as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -484,44 +505,40 @@ def main(): if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # import correct text encoder class - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, - subfolder="unet", ) + subfolder="unet", + ) # We only train the additional adapter LoRA layers freeze_params(vae.parameters()) freeze_params(text_encoder.parameters()) freeze_params(unet.parameters()) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warning( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # now we will add new LoRA weights to the attention layers # It's important to realize here how many attention weights will be added and of which sizes # The sizes of the attention layers consist only of two different variables: @@ -538,14 +555,12 @@ def main(): # Set correct lora layers unet_lora_attn_procs = {} for name, attn_processor in unet.attn_processors.items(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - unet.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] @@ -555,14 +570,13 @@ def main(): elif isinstance(attn_processor, AttnProcessor2_5): lora_attn_processor_class = LoRAAttnProcessor2_5 else: - raise ValueError( - f"Unknown attention processor type: {attn_processor.__class__.__name__}" - ) + raise ValueError(f"Unknown attention processor type: {attn_processor.__class__.__name__}") unet_lora_attn_procs[name] = lora_attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) unet.set_attn_processor(unet_lora_attn_procs) unet_lora_layers = AttnProcsLayers(unet.attn_processors) @@ -578,10 +592,12 @@ def main(): text_lora_attn_procs[name] = LoRAAttnProcessor( hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None, - rank=args.lora_rank, ) + rank=args.lora_rank, + ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) temp_pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, text_encoder=text_encoder) + args.pretrained_model_name_or_path, text_encoder=text_encoder + ) temp_pipeline._modify_text_encoder(text_lora_attn_procs) text_encoder = temp_pipeline.text_encoder del temp_pipeline @@ -594,7 +610,8 @@ def main(): if args.debug: file_path = get_path_from_url_with_filelock( "https://paddlenlp.bj.bcebos.com/models/community/junnyu/develop/pokemon-blip-captions.tar.gz", - PPDIFFUSERS_CACHE, ) + PPDIFFUSERS_CACHE, + ) dataset = DatasetDict.load_from_disk(file_path) args.dataset_name = "lambdalabs/pokemon-blip-captions" else: @@ -603,7 +620,8 @@ def main(): dataset = load_dataset( args.dataset_name, args.dataset_config_name, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) else: data_files = {} if args.train_data_dir is not None: @@ -611,7 +629,8 @@ def main(): dataset = load_dataset( "imagefolder", data_files=data_files, - cache_dir=args.cache_dir, ) + cache_dir=args.cache_dir, + ) # See more about loading custom images at # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder @@ -622,8 +641,7 @@ def main(): # 6. Get the column names for input/target. 
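# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the LoRA
# processors configured above add a trainable low-rank update to each frozen
# attention projection, y = W x + (alpha / r) * B(A(x)), which is why only
# `hidden_size`, `cross_attention_dim` and `rank` are needed to size them.
# The class `ToyLoRALinear` below is a hypothetical illustration of that idea,
# not a ppdiffusers API.
import paddle
import paddle.nn as nn

class ToyLoRALinear(nn.Layer):
    def __init__(self, in_features, out_features, rank=4, alpha=4.0):
        super().__init__()
        self.base = nn.Linear(in_features, out_features)  # stands in for the frozen pretrained projection
        self.base.weight.stop_gradient = True
        self.base.bias.stop_gradient = True
        self.lora_down = nn.Linear(in_features, rank, bias_attr=False)  # A: d -> r
        self.lora_up = nn.Linear(rank, out_features, bias_attr=False)   # B: r -> d
        self.scale = alpha / rank

    def forward(self, x):
        # frozen base output plus the scaled low-rank correction
        return self.base(x) + self.scale * self.lora_up(self.lora_down(x))

# With rank 4, a 320x320 projection trains 2 * 320 * 4 = 2,560 extra parameters
# instead of updating all 102,400, which is the saving LoRA fine-tuning relies on.
x = paddle.randn([2, 77, 320])
print(ToyLoRALinear(320, 320, rank=4)(x).shape)  # [2, 77, 320]
# ---------------------------------------------------------------------------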
dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) if args.image_column is None: - image_column = (dataset_columns[0] - if dataset_columns is not None else column_names[0]) + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: image_column = args.image_column if image_column not in column_names: @@ -631,8 +649,7 @@ def main(): f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" ) if args.caption_column is None: - caption_column = (dataset_columns[1] - if dataset_columns is not None else column_names[1]) + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: caption_column = args.caption_column if caption_column not in column_names: @@ -649,8 +666,7 @@ def tokenize_captions(examples, is_train=True): captions.append(caption) elif isinstance(caption, (list, np.ndarray)): # take a random caption if there are multiple - captions.append( - random.choice(caption) if is_train else caption[0]) + captions.append(random.choice(caption) if is_train else caption[0]) else: raise ValueError( f"Caption column `{caption_column}` should contain either strings or lists of strings." @@ -660,20 +676,22 @@ def tokenize_captions(examples, is_train=True): max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True, - return_attention_mask=False, ) + return_attention_mask=False, + ) return inputs.input_ids # Preprocessing the datasets. - train_transforms = transforms.Compose([ - transforms.Resize( - (args.height, args.width), interpolation="bilinear"), - transforms.CenterCrop((args.height, args.width)) if args.center_crop - else transforms.RandomCrop((args.height, args.width)), - transforms.RandomHorizontalFlip() - if args.random_flip else Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + train_transforms = transforms.Compose( + [ + transforms.Resize((args.height, args.width), interpolation="bilinear"), + transforms.CenterCrop((args.height, args.width)) + if args.center_crop + else transforms.RandomCrop((args.height, args.width)), + transforms.RandomHorizontalFlip() if args.random_flip else Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def preprocess_train(examples): images = [image.convert("RGB") for image in examples[image_column]] @@ -683,67 +701,62 @@ def preprocess_train(examples): with main_process_first(): if args.max_train_samples is not None: - dataset["train"] = (dataset["train"].shuffle(seed=args.seed) - .select(range(args.max_train_samples))) + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) # Set the training transforms train_dataset = dataset["train"].with_transform(preprocess_train) def collate_fn(examples): - pixel_values = paddle.stack( - [example["pixel_values"] for example in examples]).cast("float32") + pixel_values = paddle.stack([example["pixel_values"] for example in examples]).cast("float32") input_ids = [example["input_ids"] for example in examples] input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - 
batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) - params_to_optimize = (list(unet_lora_layers.parameters()) + - list(text_encoder_lora_layers.parameters()) - if args.train_text_encoder else - unet_lora_layers.parameters()) + params_to_optimize = ( + list(unet_lora_layers.parameters()) + list(text_encoder_lora_layers.parameters()) + if args.train_text_encoder + else unet_lora_layers.parameters() + ) # Optimizer creation optimizer = AdamW( learning_rate=lr_scheduler, @@ -752,8 +765,8 @@ def collate_fn(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if num_processes > 1: unet = paddle.DataParallel(unet) @@ -768,25 +781,19 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. - progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 vae.eval() @@ -807,52 +814,43 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org/blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where unet_ctx_manager = unet.no_sync() else: - unet_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() if use_attention_mask: - attention_mask = ( - batch["input_ids"] != tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] with unet_ctx_manager: # Predict the noise residual / sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") loss = F.mse_loss(model_pred, target, reduction="mean") @@ -883,52 +881,51 @@ def collate_fn(examples): writer.add_scalar(f"train/{name}", val, global_step) if global_step % args.checkpointing_steps == 0: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") # We combine the text encoder and UNet LoRA parameters with a simple # custom logic. 
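# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch): the two loss
# targets chosen above follow the usual DDPM parameterisation. Given the
# forward process x_t = sqrt(a_t) * x_0 + sqrt(1 - a_t) * eps, the target is
# eps for "epsilon" and v = sqrt(a_t) * eps - sqrt(1 - a_t) * x_0 for
# "v_prediction" (the same formula as `get_velocity` in ldm/model.py later in
# this patch). A tiny numerical check of the v-prediction identity:
import paddle

a_t = paddle.to_tensor(0.7)                       # cumulative alpha product at some timestep
x0, eps = paddle.randn([4]), paddle.randn([4])
x_t = a_t.sqrt() * x0 + (1 - a_t).sqrt() * eps    # noisy sample
v = a_t.sqrt() * eps - (1 - a_t).sqrt() * x0      # v-prediction target
print(paddle.allclose(a_t.sqrt() * x_t - (1 - a_t).sqrt() * v, x0))  # True: x_0 is recoverable from v
# ---------------------------------------------------------------------------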
So, use `LoraLoaderMixin.save_lora_weights()`. LoraLoaderMixin.save_lora_weights( save_directory=save_path, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) logger.info(f"Saved lora weights to {save_path}") if global_step >= args.max_train_steps: break if is_main_process: - if (args.validation_prompt is not None and - epoch % args.validation_epochs == 0): + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}.") + f" {args.validation_prompt}." + ) # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, unet=unwrap_model(unet), text_encoder=unwrap_model(text_encoder), safety_checker=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipeline.set_progress_bar_config(disable=True) # run inference - generator = (paddle.Generator().manual_seed(args.seed) - if args.seed else None) + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ pipeline( args.validation_prompt, num_inference_steps=30, - generator=generator, ).images[0] + generator=generator, + ).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) if args.report_to == "tensorboard": - writer.add_images( - "validation", np_images, epoch, dataformats="NHWC") + writer.add_images("validation", np_images, epoch, dataformats="NHWC") else: - writer.add_image( - "validation", np_images, epoch, dataformats="NHWC") + writer.add_image("validation", np_images, epoch, dataformats="NHWC") del pipeline gc.collect() @@ -941,7 +938,8 @@ def collate_fn(examples): LoraLoaderMixin.save_lora_weights( save_directory=args.output_dir, unet_lora_layers=unet_lora_layers, - text_encoder_lora_layers=text_encoder_lora_layers, ) + text_encoder_lora_layers=text_encoder_lora_layers, + ) if args.push_to_hub: save_model_card( @@ -949,31 +947,25 @@ def collate_fn(examples): images=images, base_model=args.pretrained_model_name_or_path, prompt=args.instance_prompt, - repo_folder=args.output_dir, ) - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo_folder=args.output_dir, + ) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) # Final inference # Load previous pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) # load attention processors pipeline.load_lora_weights(args.output_dir) # run inference if args.validation_prompt and args.num_validation_images > 0: - generator = paddle.Generator().manual_seed( - args.seed) if args.seed else None + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ - pipeline( - args.validation_prompt, - num_inference_steps=30, - generator=generator).images[0] + pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) diff --git 
a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py b/ppdiffusers/examples/text_to_image_laion400m/generate_images.py index 9f7e732a9033e..c8527964620b4 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/generate_images.py +++ b/ppdiffusers/examples/text_to_image_laion400m/generate_images.py @@ -20,9 +20,13 @@ import pandas as pd from tqdm.auto import tqdm -from ppdiffusers import (DDIMScheduler, EulerAncestralDiscreteScheduler, - LDMTextToImagePipeline, LMSDiscreteScheduler, - PNDMScheduler) +from ppdiffusers import ( + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LDMTextToImagePipeline, + LMSDiscreteScheduler, + PNDMScheduler, +) def batchify(data, batch_size=16): @@ -37,18 +41,19 @@ def batchify(data, batch_size=16): def generate_images( - model_name_or_path, - batch_size=16, - file="coco30k.csv", - save_path="output", - seed=42, - scheduler_type="ddim", - eta=0.0, - num_inference_steps=50, - guidance_scales=[3, 4, 5, 6, 7, 8], - height=256, - width=256, - device="gpu", ): + model_name_or_path, + batch_size=16, + file="coco30k.csv", + save_path="output", + seed=42, + scheduler_type="ddim", + eta=0.0, + num_inference_steps=50, + guidance_scales=[3, 4, 5, 6, 7, 8], + height=256, + width=256, + device="gpu", +): paddle.set_device(device) pipe = LDMTextToImagePipeline.from_pretrained(model_name_or_path) pipe.set_progress_bar_config(disable=True) @@ -62,17 +67,14 @@ def generate_images( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -81,7 +83,8 @@ def generate_images( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") pipe.scheduler = scheduler @@ -103,7 +106,8 @@ def generate_images( eta=eta, height=height, width=width, - num_inference_steps=num_inference_steps, )[0] + num_inference_steps=num_inference_steps, + )[0] for image in images: path = os.path.join(new_save_path, "{:05d}_000.png".format(i)) image.save(path) @@ -117,17 +121,20 @@ def generate_images( default=None, type=str, required=True, - help="model_name_or_path.", ) + help="model_name_or_path.", + ) parser.add_argument( "--file", default="./coco30k.tsv", type=str, - help="eval file.", ) + help="eval file.", + ) parser.add_argument( "--seed", default=42, type=int, - help="random seed.", ) + help="random seed.", + ) parser.add_argument( "--scheduler_type", default="ddim", @@ -137,22 +144,20 @@ def generate_images( ) parser.add_argument("--device", default="gpu", type=str, help="device") parser.add_argument("--batch_size", default=16, type=int, help="batch_size") - parser.add_argument( - "--num_inference_steps", - default=50, - type=int, - help="num_inference_steps") + parser.add_argument("--num_inference_steps", default=50, type=int, help="num_inference_steps") parser.add_argument( 
"--save_path", default="output/1.5b_ldm/12w.pd", type=str, - help="Path to the output file.", ) + help="Path to the output file.", + ) parser.add_argument( "--guidance_scales", default=[3, 4, 5, 6, 7, 8], nargs="+", type=str, - help="guidance_scales list.", ) + help="guidance_scales list.", + ) parser.add_argument("--height", default=256, type=int, help="height.") parser.add_argument("--width", default=256, type=int, help="width.") args = parser.parse_args() @@ -171,4 +176,5 @@ def generate_images( scheduler_type=args.scheduler_type, height=args.height, width=args.width, - device=args.device, ) + device=args.device, + ) diff --git a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py b/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py index c89f6fd190bf7..069fde479ce3d 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py +++ b/ppdiffusers/examples/text_to_image_laion400m/generate_pipelines.py @@ -19,8 +19,13 @@ from paddlenlp.transformers import AutoTokenizer from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LDMBertModel, - LDMTextToImagePipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LDMBertModel, + LDMTextToImagePipeline, + UNet2DConditionModel, +) from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig @@ -30,27 +35,32 @@ def parse_args(): "--model_file", type=str, default="./model_state.pdparams", - help="path to pretrained model_state.pdparams", ) + help="path to pretrained model_state.pdparams", + ) parser.add_argument( "--output_path", type=str, default="./ldm_pipelines", - help="the output path of pipeline.", ) + help="the output path of pipeline.", + ) parser.add_argument( "--vae_name_or_path", type=str, default="CompVis/stable-diffusion-v1-4/vae", - help="pretrained_vae_name_or_path.", ) + help="pretrained_vae_name_or_path.", + ) parser.add_argument( "--text_encoder_config_file", type=str, default="./config/ldmbert.json", - help="text_encoder_config_file.", ) + help="text_encoder_config_file.", + ) parser.add_argument( "--unet_config_file", type=str, default="./config/unet.json", - help="unet_config_file.", ) + help="unet_config_file.", + ) parser.add_argument( "--tokenizer_name_or_path", type=str, @@ -61,12 +71,9 @@ def parse_args(): "--model_max_length", type=int, default=77, - help="Pretrained tokenizer model_max_length.", ) - parser.add_argument( - "--device", - type=str, - default=None, - help="Device to use. Like gpu:0 or cpu") + help="Pretrained tokenizer model_max_length.", + ) + parser.add_argument("--device", type=str, default=None, help="Device to use. 
Like gpu:0 or cpu") return parser.parse_args() @@ -119,17 +126,17 @@ def check_keys(model, state_dict): def build_pipelines( - model_file, - output_path, - vae_name_or_path, - unet_config_file, - text_encoder_config_file, - tokenizer_name_or_path="bert-base-uncased", - model_max_length=77, ): + model_file, + output_path, + vae_name_or_path, + unet_config_file, + text_encoder_config_file, + tokenizer_name_or_path="bert-base-uncased", + model_max_length=77, +): vae = AutoencoderKL.from_config(vae_name_or_path) unet = UNet2DConditionModel(**read_json(unet_config_file)) - tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, model_max_length=model_max_length) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, model_max_length=model_max_length) text_encoder_config = read_json(text_encoder_config_file) vocab_size = text_encoder_config["vocab_size"] max_position_embeddings = text_encoder_config["max_position_embeddings"] @@ -143,8 +150,7 @@ def build_pipelines( logger.info( f"The tokenizer's model_max_length {tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {tokenizer.model_max_length} as max_position_embeddings!" ) - text_encoder_config[ - "max_position_embeddings"] = tokenizer.model_max_length + text_encoder_config["max_position_embeddings"] = tokenizer.model_max_length cofnig = LDMBertConfig(**text_encoder_config) text_encoder = LDMBertModel(cofnig) scheduler = DDIMScheduler( @@ -154,7 +160,8 @@ def build_pipelines( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) unet_dict, vae_dict, text_encoder_dict = extract_paramaters(model_file) check_keys(unet, unet_dict) check_keys(vae, vae_dict) @@ -167,7 +174,8 @@ def build_pipelines( tokenizer=tokenizer, scheduler=scheduler, vqvae=vae, - unet=unet, ) + unet=unet, + ) pipe.save_pretrained(output_path) @@ -182,4 +190,5 @@ def build_pipelines( unet_config_file=args.unet_config_file, text_encoder_config_file=args.text_encoder_config_file, tokenizer_name_or_path=args.tokenizer_name_or_path, - model_max_length=args.model_max_length, ) + model_max_length=args.model_max_length, + ) diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py index f7c2e091bed03..0443a7224578e 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_args.py @@ -29,51 +29,43 @@ class ModelArguments: # use pretrained vae kl-8.ckpt (CompVis/stable-diffusion-v1-4/vae) vae_name_or_path: Optional[str] = field( default="CompVis/stable-diffusion-v1-4/vae", - metadata={"help": "pretrained_vae_name_or_path"}, ) + metadata={"help": "pretrained_vae_name_or_path"}, + ) text_encoder_config_file: Optional[str] = field( - default="./config/ldmbert.json", - metadata={"help": "text_encoder_config_file"}) - unet_config_file: Optional[str] = field( - default="./config/unet.json", metadata={"help": "unet_config_file"}) + default="./config/ldmbert.json", metadata={"help": "text_encoder_config_file"} + ) + unet_config_file: Optional[str] = field(default="./config/unet.json", metadata={"help": "unet_config_file"}) tokenizer_name: Optional[str] = field( default="bert-base-uncased", - metadata={ - "help": - "Pretrained tokenizer name or path if not the same as model_name" - }, ) - model_max_length: Optional[int] = field( - default=77, metadata={"help": "Pretrained 
tokenizer model_max_length"}) - num_inference_steps: Optional[int] = field( - default=200, metadata={"help": "num_inference_steps"}) - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}, + ) + model_max_length: Optional[int] = field(default=77, metadata={"help": "Pretrained tokenizer model_max_length"}) + num_inference_steps: Optional[int] = field(default=200, metadata={"help": "num_inference_steps"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) pretrained_model_name_or_path: str = field( default=None, - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable_xformers_memory_efficient_attention."}) - to_static: bool = field( - default=False, metadata={"help": "Whether or not to_static"}) + default=False, metadata={"help": "enable_xformers_memory_efficient_attention."} + ) + to_static: bool = field(default=False, metadata={"help": "Whether or not to_static"}) prediction_type: Optional[str] = field( default="epsilon", metadata={ - "help": - "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" - }, ) + "help": "prediction_type, prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf)" + }, + ) benchmark: bool = field( default=False, - metadata={"help": "Whether or not run benchmark."}, ) + metadata={"help": "Whether or not run benchmark."}, + ) profiler_options: Optional[str] = field( default=None, - metadata={"help": "profiler_options."}, ) - noise_offset: Optional[int] = field( - default=0, metadata={"help": "The scale of noise offset."}) + metadata={"help": "profiler_options."}, + ) + noise_offset: Optional[int] = field(default=0, metadata={"help": "The scale of noise offset."}) @dataclass @@ -84,113 +76,89 @@ class DataArguments: file_list: str = field( default="./data/filelist/train.filelist.list", - metadata={"help": "The name of the file_list."}, ) + metadata={"help": "The name of the file_list."}, + ) resolution: int = field( default=256, metadata={ - "help": - "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." - }, ) + "help": "The resolution for input images, all the images in the train/validation dataset will be resized to this resolution." 
+ }, + ) num_records: int = field(default=10000000, metadata={"help": "num_records"}) buffer_size: int = field( default=100, - metadata={"help": "Buffer size"}, ) + metadata={"help": "Buffer size"}, + ) shuffle_every_n_samples: int = field( default=5, - metadata={"help": "shuffle_every_n_samples."}, ) + metadata={"help": "shuffle_every_n_samples."}, + ) @dataclass class NoTrainerTrainingArguments: output_dir: str = field( default="outputs", - metadata={ - "help": - "The output directory where the model predictions and checkpoints will be written." - }, ) + metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, + ) per_device_train_batch_size: int = field( - default=16, - metadata={"help": "Batch size per GPU core/CPU for training."}) + default=16, metadata={"help": "Batch size per GPU core/CPU for training."} + ) gradient_accumulation_steps: int = field( default=2, - metadata={ - "help": - "Number of updates steps to accumulate before performing a backward/update pass." - }, ) - learning_rate: float = field( - default=5e-5, - metadata={"help": "The initial learning rate for AdamW."}) - weight_decay: float = field( - default=0.02, - metadata={"help": "Weight decay for AdamW if we apply some."}) - adam_beta1: float = field( - default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) - adam_beta2: float = field( - default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) - adam_epsilon: float = field( - default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) - max_grad_norm: float = field( - default=-1.0, metadata={"help": "Max gradient norm."}) - num_train_epochs: int = field( - default=100, - metadata={"help": "Total number of training epochs to perform."}) + metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, + ) + learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) + weight_decay: float = field(default=0.02, metadata={"help": "Weight decay for AdamW if we apply some."}) + adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) + adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) + adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) + max_grad_norm: float = field(default=-1.0, metadata={"help": "Max gradient norm."}) + num_train_epochs: int = field(default=100, metadata={"help": "Total number of training epochs to perform."}) max_steps: int = field( default=1000000000, - metadata={ - "help": - "If > 0: set total number of training steps to perform. Override num_train_epochs." - }, ) + metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, + ) lr_scheduler_type: str = field( default="constant", metadata={ - "help": - 'The scheduler type to use. support ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]' - }, ) - warmup_steps: int = field( - default=0, metadata={"help": "Linear warmup over warmup_steps."}) + "help": 'The scheduler type to use. 
support ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]' + }, + ) + warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) - logging_dir: Optional[str] = field( - default="logs", metadata={"help": "VisualDL log dir."}) + logging_dir: Optional[str] = field(default="logs", metadata={"help": "VisualDL log dir."}) - logging_steps: int = field( - default=50, metadata={"help": "Log every X updates steps."}) + logging_steps: int = field(default=50, metadata={"help": "Log every X updates steps."}) - save_steps: int = field( - default=5000, - metadata={"help": "Save checkpoint every X updates steps."}) + save_steps: int = field(default=5000, metadata={"help": "Save checkpoint every X updates steps."}) seed: int = field( default=23, - metadata={ - "help": "Random seed that will be set at the beginning of training." - }, ) + metadata={"help": "Random seed that will be set at the beginning of training."}, + ) dataloader_num_workers: int = field( default=6, metadata={ - "help": - "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." - }, ) + "help": "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + }, + ) report_to: str = field( default="visualdl", - metadata={ - "help": - "The list of integrations to report the results and logs to." - }, ) + metadata={"help": "The list of integrations to report the results and logs to."}, + ) recompute: bool = field( default=False, metadata={ - "help": - "Recompute the forward pass to calculate gradients. Used for saving memory. " + "help": "Recompute the forward pass to calculate gradients. Used for saving memory. " "Only support for networks with transformer blocks." 
- }, ) + }, + ) def __str__(self): self_as_dict = asdict(self) - self_as_dict = { - k: f"<{k.upper()}>" if k.endswith("_token") else v - for k, v in self_as_dict.items() - } + self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()} attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())] return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})" @@ -207,8 +175,7 @@ def print_config(self, args=None, key=""): key = "Training" logger.info("{:^40}".format("{} Configuration Arguments".format(key))) - logger.info("{:30}:{}".format("paddle commit id", - paddle.version.commit)) + logger.info("{:30}:{}".format("paddle commit id", paddle.version.commit)) for a in dir(args): if a[:2] != "__": # don't print double underscore methods diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py index 9103c0221f18a..6a99ea7a8f8bc 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/ldm_trainer.py @@ -20,7 +20,11 @@ from paddle.io import DataLoader from paddlenlp.trainer import PrinterCallback, ProgressCallback, Trainer from paddlenlp.trainer.integrations import ( - INTEGRATION_TO_CALLBACK, TrainerCallback, VisualDLCallback, rewrite_logs) + INTEGRATION_TO_CALLBACK, + TrainerCallback, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.utils import profiler from paddlenlp.utils.log import logger @@ -38,19 +42,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -58,22 +60,26 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - image_logs["reconstruction"] = model.decode_image( - pixel_values=inputs["pixel_values"]) + image_logs["reconstruction"] = model.decode_image(pixel_values=inputs["pixel_values"]) image_logs["ddim-samples-1.0"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=1.0, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) image_logs["ddim-samples-7.5"] = model.log_image( input_ids=inputs["input_ids"], guidance_scale=7.5, height=args.resolution, - width=args.resolution, ) + width=args.resolution, + ) if not state.is_world_process_zero: return @@ -91,11 +97,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" 
of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -136,8 +142,7 @@ def __init__(self, benchmark=True, profiler_options=None): self.profiler_options = profiler_options def on_train_begin(self, args, state, control, **kwargs): - assert (args.gradient_accumulation_steps == 1 and not args.do_eval and - not args.do_predict) + assert args.gradient_accumulation_steps == 1 and not args.do_eval and not args.do_predict if self.benchmark: self.reader_cost_avg = AverageStatistical() @@ -162,8 +167,7 @@ def on_step_end(self, args, state, control, **kwargs): def on_log(self, args, state, control, logs=None, **kwargs): if self.benchmark: if logs is not None and "interval_steps_per_second" in logs: - self.batch_start = self.batch_start + ( - time.time() - self.maybe_log_save_evaluate_start) + self.batch_start = self.batch_start + (time.time() - self.maybe_log_save_evaluate_start) ips = logs["interval_steps_per_second"] * args.train_batch_size avg_batch_cost = 1 / logs["interval_steps_per_second"] logger.info( @@ -175,14 +179,15 @@ def on_log(self, args, state, control, logs=None, **kwargs): self.reader_cost_avg.get_average(), avg_batch_cost, args.train_batch_size, - ips, )) + ips, + ) + ) self.reader_cost_avg.reset() def on_epoch_end(self, args, state, control, **kwargs): if self.benchmark: train_epoch_cost = time.time() - self.epoch_start - logger.info("train epoch: %d, epoch_cost: %.5f s" % - (state.epoch, train_epoch_cost)) + logger.info("train epoch: %d, epoch_cost: %.5f s" % (state.epoch, train_epoch_cost)) # register visualdl_with_image @@ -196,7 +201,9 @@ def __init__(self, **kwargs): self.add_callback( BenchmarkCallback( benchmark=self.args.benchmark, - profiler_options=self.args.profiler_options, )) + profiler_options=self.args.profiler_options, + ) + ) if self.args.benchmark: if self.args.disable_tqdm: self.pop_callback(PrinterCallback) @@ -215,6 +222,7 @@ def get_train_dataloader(self): self.train_dataset, batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) else: return super().get_train_dataloader() diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py index 2fe8ba07c5621..5b4bb009920c4 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/model.py @@ -20,9 +20,14 @@ import paddle.nn.functional as F from paddlenlp.transformers import AutoTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LDMBertModel, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LDMBertModel, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.models.attention import AttentionBlock from ppdiffusers.models.ema import LitEma from ppdiffusers.pipelines.latent_diffusion import LDMBertConfig @@ -31,15 +36,15 @@ try: from ppdiffusers.models.attention import SpatialTransformer except ImportError: - from ppdiffusers.models.transformer_2d import (Transformer2DModel as - SpatialTransformer, ) 
+ from ppdiffusers.models.transformer_2d import ( + Transformer2DModel as SpatialTransformer, + ) import json from paddlenlp.utils.log import logger -from ppdiffusers.initializer import (normal_, reset_initialized_parameter, - zeros_) +from ppdiffusers.initializer import normal_, reset_initialized_parameter, zeros_ from ppdiffusers.models.resnet import ResnetBlock2D @@ -55,31 +60,31 @@ def __init__(self, model_args): # init tokenizer tokenizer_name_or_path = ( model_args.tokenizer_name - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "tokenizer")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path, - model_max_length=model_args.model_max_length) + tokenizer_name_or_path, model_max_length=model_args.model_max_length + ) # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "vqvae")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "vqvae") + ) self.vae = AutoencoderKL.from_pretrained(vae_name_or_path) freeze_params(self.vae.parameters()) logger.info("Freeze vae parameters!") if model_args.pretrained_model_name_or_path is None: assert ( - model_args.text_encoder_config_file is not None and - model_args.unet_config_file is not None + model_args.text_encoder_config_file is not None and model_args.unet_config_file is not None ), "we must supply text_encoder_config_file & unet_config_file" # init text_encoder text_encoder_config = read_json(model_args.text_encoder_config_file) vocab_size = text_encoder_config["vocab_size"] - max_position_embeddings = text_encoder_config[ - "max_position_embeddings"] + max_position_embeddings = text_encoder_config["max_position_embeddings"] if self.tokenizer.vocab_size != vocab_size: logger.info( f"The tokenizer has a vocab size of {self.tokenizer.vocab_size}, while the text encoder has a vocab size of {vocab_size}, we will use {self.tokenizer.vocab_size} as vocab_size!" @@ -90,24 +95,24 @@ def __init__(self, model_args): logger.info( f"The tokenizer's model_max_length {self.tokenizer.model_max_length}, while the text encoder's max_position_embeddings is {max_position_embeddings}, we will use {self.tokenizer.model_max_length} as max_position_embeddings!" 
) - text_encoder_config[ - "max_position_embeddings"] = self.tokenizer.model_max_length + text_encoder_config["max_position_embeddings"] = self.tokenizer.model_max_length config = LDMBertConfig(**text_encoder_config) self.text_encoder = LDMBertModel(config) self.text_encoder_is_pretrained = False # init unet2d - self.unet = UNet2DConditionModel( - **read_json(model_args.unet_config_file)) + self.unet = UNet2DConditionModel(**read_json(model_args.unet_config_file)) self.unet_is_pretrained = False else: # init text_encoder self.text_encoder = LDMBertModel.from_pretrained( - model_args.pretrained_model_name_or_path, subfolder="bert") + model_args.pretrained_model_name_or_path, subfolder="bert" + ) self.text_encoder_is_pretrained = True # init unet2d self.unet = UNet2DConditionModel.from_pretrained( - model_args.pretrained_model_name_or_path, subfolder="unet") + model_args.pretrained_model_name_or_path, subfolder="unet" + ) self.unet_is_pretrained = True assert model_args.prediction_type in ["epsilon", "v_prediction"] @@ -117,9 +122,9 @@ def __init__(self, model_args): beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, - prediction_type=self.prediction_type, ) - self.register_buffer("alphas_cumprod", - self.noise_scheduler.alphas_cumprod) + prediction_type=self.prediction_type, + ) + self.register_buffer("alphas_cumprod", self.noise_scheduler.alphas_cumprod) if model_args.image_logging_steps > 0: self.eval_scheduler = DDIMScheduler( @@ -130,7 +135,8 @@ def __init__(self, model_args): clip_sample=False, set_alpha_to_one=False, steps_offset=1, - prediction_type=self.prediction_type, ) + prediction_type=self.prediction_type, + ) self.eval_scheduler.set_timesteps(model_args.num_inference_steps) self.init_weights() self.use_ema = model_args.use_ema @@ -138,14 +144,14 @@ def __init__(self, model_args): if self.use_ema: self.model_ema = LitEma(self.unet) - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. 
Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # make sure unet text_encoder in train mode, vae in eval mode self.unet.train() @@ -153,35 +159,31 @@ def __init__(self, model_args): self.vae.eval() def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: - sqrt_alpha_prod = self.alphas_cumprod[timesteps]**0.5 + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -193,10 +195,8 @@ def init_weights(self): # init text_encoder if not self.text_encoder_is_pretrained: reset_initialized_parameter(self.text_encoder) - normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, - 0.02) - normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, - 0.02) + normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, 0.02) + normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, 0.02) # init unet if not self.unet_is_pretrained: reset_initialized_parameter(self.unet) @@ -243,16 +243,15 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): if self.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += self.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=noise.dtype) - timesteps = paddle.randint(0, - self.noise_scheduler.num_train_timesteps, - (latents.shape[0], )).astype("int64") + (latents.shape[0], latents.shape[1], 1, 1), dtype=noise.dtype + ) + timesteps = paddle.randint(0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],)).astype( + "int64" + ) noisy_latents = self.add_noise(latents, noise, timesteps) encoder_hidden_states = self.text_encoder(input_ids)[0] - noise_pred = self.unet(noisy_latents, 
timesteps, - encoder_hidden_states).sample + noise_pred = self.unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if self.prediction_type == "epsilon": @@ -262,10 +261,7 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): else: raise ValueError(f"Unknown prediction type {self.prediction_type}") - loss = (F.mse_loss( - noise_pred.cast("float32"), - target.cast("float32"), - reduction="none").mean([1, 2, 3]).mean()) + loss = F.mse_loss(noise_pred.cast("float32"), target.cast("float32"), reduction="none").mean([1, 2, 3]).mean() return loss @@ -282,19 +278,18 @@ def decode_image(self, pixel_values=None, **kwargs): @paddle.no_grad() def log_image( - self, - input_ids=None, - height=256, - width=256, - eta=0.0, - guidance_scale=7.5, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=0.0, + guidance_scale=7.5, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 8 image if input_ids.shape[0] > 8: input_ids = input_ids[:8] @@ -308,43 +303,34 @@ def log_image( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) - latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, - height // 8, width // 8)) + latents = paddle.randn((input_ids.shape[0], self.unet.in_channels, height // 8, width // 8)) # ddim donot use this latents = latents * self.eval_scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = 1 / 0.18215 * latents image = self.vae.decode(latents).sample @@ -356,12 +342,10 @@ def fn(layer): # 
ldmbert if hasattr(layer, "enable_recompute"): layer.enable_recompute = value - print("Set", layer.__class__, "recompute", - layer.enable_recompute) + print("Set", layer.__class__, "recompute", layer.enable_recompute) # unet if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.apply(fn) diff --git a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py b/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py index 82d71e6c5f816..b41f0b799469f 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py +++ b/ppdiffusers/examples/text_to_image_laion400m/ldm/text_image_pair_dataset.py @@ -46,8 +46,7 @@ def parse_src(filename): elif data_source == "laion_aes": text_json = json.loads(vec[2]) img_b64 = vec[5] - caption = text_json.get("caption_en", - text_json.get("blip_caption_en", "")) + caption = text_json.get("caption_en", text_json.get("blip_caption_en", "")) else: _, captions, _, _, _, img_b64 = vec[:6] caption = random.sample(captions.split("|"), 1)[0].replace("\1", "") @@ -77,23 +76,26 @@ def _get_param(self, img, output_size): class TextImagePair(IterableDataset): def __init__( - self, - file_list, - size, - num_records, - image_processing=None, - buffer_size=1000, - shuffle_every_n_samples=5, - interpolation="lanczos", - tokenizer=None, ): + self, + file_list, + size, + num_records, + image_processing=None, + buffer_size=1000, + shuffle_every_n_samples=5, + interpolation="lanczos", + tokenizer=None, + ): self.size = size if image_processing is None: - self.image_processing = transforms.Compose([ - transforms.Resize(int(size / 0.9), interpolation), - RandomCrop(size), - transforms.ToTensor(), - transforms.Normalize(0.5, 0.5), - ]) + self.image_processing = transforms.Compose( + [ + transforms.Resize(int(size / 0.9), interpolation), + RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize(0.5, 0.5), + ] + ) else: self.image_processing = image_processing self.text_processing = lambda caption: tokenizer( @@ -101,7 +103,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids[0] + return_tensors="pd", + ).input_ids[0] self.file_list = [] file_weights = [] with open(file_list, "r") as f: @@ -122,19 +125,14 @@ def __init__( file_weights = file_weights / file_weight_sum print(f"sample weights of files: {file_weights}") self.file_weights_cumsum = np.cumsum(file_weights) - self.file_weights_cumsum = np.concatenate( - [[0.0], self.file_weights_cumsum]) + self.file_weights_cumsum = np.concatenate([[0.0], self.file_weights_cumsum]) else: print("sample each file list with same probabiliy") self.file_weights_cumsum = None self.num_records = num_records - self.file_ids = [ - np.arange(len(filelist)) for filelist in self.file_list - ] - print( - f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}" - ) + self.file_ids = [np.arange(len(filelist)) for filelist in self.file_list] + print(f"original lengths of self.file_ids: {[len(f) for f in self.file_ids]}") self.buffer_size = buffer_size self.shuffle_every_n_samples = shuffle_every_n_samples @@ -143,9 +141,7 @@ def sample_loader(self, file_ids, filenames): random.shuffle(file_ids) for i in file_ids: filename = filenames[i].strip("\n") - with gzip.open(filename, - "rb") if filename.endswith(".gz") 
else open( - filename, "rb") as f: + with gzip.open(filename, "rb") if filename.endswith(".gz") else open(filename, "rb") as f: # retry = 0 while True: line = f.readline() @@ -171,19 +167,14 @@ def sample_loader(self, file_ids, filenames): if w < self.size or h < self.size: continue yield { - "pixel_values": - self.image_processing(data["image"]), - "input_ids": - self.text_processing(data["caption"]), + "pixel_values": self.image_processing(data["image"]), + "input_ids": self.text_processing(data["caption"]), } def random_load_from_multi_dataset(self): - print( - f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}" - ) + print(f"lengths of self.file_ids in random_load: {[len(f) for f in self.file_ids]}") sample_loader_per_dataset = [ - iter(self.sample_loader(self.file_ids[i], self.file_list[i])) - for i in range(len(self.file_ids)) + iter(self.sample_loader(self.file_ids[i], self.file_list[i])) for i in range(len(self.file_ids)) ] while True: @@ -192,8 +183,7 @@ def random_load_from_multi_dataset(self): else: rand_num = random.random() for i in range(len(self.file_list)): - if (self.file_weights_cumsum[i] <= rand_num < - self.file_weights_cumsum[i + 1]): + if self.file_weights_cumsum[i] <= rand_num < self.file_weights_cumsum[i + 1]: break sample_loader = sample_loader_per_dataset[i] # debug diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py index c3249e9caca29..d3da3f1f9d187 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_orig_ldm_ckpt_to_ppdiffusers.py @@ -26,10 +26,16 @@ ) from paddlenlp.transformers import BertTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LDMBertModel, - LDMTextToImagePipeline, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LDMBertModel, + LDMTextToImagePipeline, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) paddle.set_device("cpu") @@ -59,8 +65,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -76,8 +81,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -119,8 +123,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -128,21 
+131,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -150,13 +152,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -167,8 +167,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -178,8 +177,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -206,25 +204,19 @@ def create_unet_diffusers_config(original_config): """ unet_params = original_config.model.params.unet_config.params - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 @@ -237,7 +229,8 @@ def create_unet_diffusers_config(original_config): block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=unet_params.context_dim, - attention_head_dim=unet_params.num_heads, ) + 
attention_head_dim=unet_params.num_heads, + ) return config @@ -261,14 +254,12 @@ def create_vae_diffusers_config(original_config): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -289,8 +280,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -303,17 +293,12 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -322,35 +307,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = 
len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -359,21 +332,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -385,7 +354,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -398,19 +368,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -419,14 +388,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -437,12 +405,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] 
resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -456,17 +420,17 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.weight", "conv.bias"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -476,27 +440,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -514,107 +479,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = 
vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if 
"encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -622,58 +554,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -681,14 +605,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def 
convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -741,7 +664,8 @@ def create_ldm_bert_config(original_config): attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - pad_token_id=0, ) + pad_token_id=0, + ) return config @@ -755,61 +679,56 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[ - "transformer.token_emb.weight"].numpy() - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[ - "transformer.pos_emb.emb.weight"].numpy() + new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"].numpy() + new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"].numpy() for i in range(config["encoder_layers"]): double_i = 2 * i double_i_plus1 = 2 * i + 1 # convert norm new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight"].numpy() + f"transformer.attn_layers.layers.{double_i}.0.weight" + ].numpy() new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias"].numpy() + f"transformer.attn_layers.layers.{double_i}.0.bias" + ].numpy() new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].t() - .numpy()) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].t() - .numpy()) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].t() - .numpy()) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( - bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"] - .t().numpy()) - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"].numpy( - ) + bert_state_dict[f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].t().numpy() + ) + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" + ].numpy() new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"].numpy() + f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" + ].numpy() new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"].numpy() - new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = (bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"] - .t().numpy()) + 
f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" + ].numpy() + new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = ( + bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].t().numpy() + ) new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"].numpy( - ) - new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = (bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"] - .t().numpy()) - new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = (bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].t() - .numpy()) - - new_checkpoint["final_layer_norm.weight"] = bert_state_dict[ - "transformer.norm.weight"].numpy() - new_checkpoint["final_layer_norm.bias"] = bert_state_dict[ - "transformer.norm.bias"].numpy() + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + ].numpy() + new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = ( + bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].t().numpy() + ) + new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = ( + bert_state_dict[f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].t().numpy() + ) + + new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"].numpy() + new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"].numpy() return new_checkpoint @@ -822,7 +741,8 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) # wget https://raw.githubusercontent.com/CompVis/latent-diffusion/main/configs/latent-diffusion/txt2img-1p4B-eval.yaml parser.add_argument( "--original_config_file", @@ -844,13 +764,15 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() @@ -871,46 +793,40 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = UNet2DConditionModel(**diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the VAE model. 
vae_config = create_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] if text_model_type != "BERTEmbedder": print("We only support BERTEmbedder as text_encoder!") # 4. Convert the Bert model. bert_config = create_ldm_bert_config(original_config) - ppdiffusers_bert_checkpoint = convert_ldm_bert_to_ppdiffusers(checkpoint, - bert_config) + ppdiffusers_bert_checkpoint = convert_ldm_bert_to_ppdiffusers(checkpoint, bert_config) bert = LDMBertModel(**bert_config) check_keys(bert, ppdiffusers_bert_checkpoint) bert.load_dict(ppdiffusers_bert_checkpoint) # 5. Convert tokenizer. tokenizer = BertTokenizer.from_pretrained( - "bert-base-uncased", - model_max_length=bert_config["max_position_embeddings"]) + "bert-base-uncased", model_max_length=bert_config["max_position_embeddings"] + ) if tokenizer.vocab_size != bert_config["vocab_size"]: - print( - "Vocab size mismatched! Please verify your tokenizer or text encoder!" - ) + print("Vocab size mismatched! Please verify your tokenizer or text encoder!") # 6. Convert scheduler. num_train_timesteps = original_config.model.params.timesteps @@ -925,17 +841,14 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif args.scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif args.scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -944,16 +857,11 @@ def convert_ldm_bert_to_ppdiffusers(checkpoint, config): # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") - - pipe = LDMTextToImagePipeline( - vqvae=vae, - bert=bert, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler) + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") + + pipe = LDMTextToImagePipeline(vqvae=vae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py index de9f15339690a..f9e742d3942f6 100644 --- 
a/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py +++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/convert_ppdiffusers_to_orig_ldm_ckpt.py @@ -63,15 +63,13 @@ # loop over resnets/attentions for downblocks hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}." sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0." - unet_conversion_map_layer.append( - (sd_down_res_prefix, hf_down_res_prefix)) + unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix)) if i < 3: # no attention layers in down_blocks.3 hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}." sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1." - unet_conversion_map_layer.append( - (sd_down_atn_prefix, hf_down_atn_prefix)) + unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix)) for j in range(3): # loop over resnets/attentions for upblocks @@ -83,21 +81,18 @@ # no attention layers in up_blocks.0 hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}." sd_up_atn_prefix = f"output_blocks.{3*i + j}.1." - unet_conversion_map_layer.append( - (sd_up_atn_prefix, hf_up_atn_prefix)) + unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix)) if i < 3: # no downsample in down_blocks.3 hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv." sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op." - unet_conversion_map_layer.append( - (sd_downsample_prefix, hf_downsample_prefix)) + unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix)) # no upsample in up_blocks.3 hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0." sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}." - unet_conversion_map_layer.append( - (sd_upsample_prefix, hf_upsample_prefix)) + unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix)) hf_mid_atn_prefix = "mid_block.attentions.0." sd_mid_atn_prefix = "middle_block.1." 
@@ -211,8 +206,7 @@ def convert_vae_state_dict(vae_state_dict): # pretty much a no-op -def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, - ppdiffusers_vae_unet_checkpoint): +def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, ppdiffusers_vae_unet_checkpoint): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -228,56 +222,63 @@ def convert_ppdiffusers_vae_unet_to_diffusers(vae_or_unet, def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32): ppdiffusers_mapping_to_orig = {} + ppdiffusers_mapping_to_orig["embeddings.word_embeddings.weight"] = "cond_stage_model.transformer.token_emb.weight" ppdiffusers_mapping_to_orig[ - "embeddings.word_embeddings.weight"] = "cond_stage_model.transformer.token_emb.weight" - ppdiffusers_mapping_to_orig[ - "embeddings.position_embeddings.weight"] = "cond_stage_model.transformer.pos_emb.emb.weight" + "embeddings.position_embeddings.weight" + ] = "cond_stage_model.transformer.pos_emb.emb.weight" for i in range(num_layers): double_i = 2 * i double_i_plus1 = 2 * i + 1 ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm1.weight"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.weight" - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm1.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.bias" - - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_q.weight", - "transpose", ) - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_k.weight", - "transpose", ) + f"encoder.layers.{i}.norm1.weight" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.weight" ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_v.weight", - "transpose", ) + f"encoder.layers.{i}.norm1.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.0.bias" + + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.q_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_q.weight", + "transpose", + ) + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.k_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_k.weight", + "transpose", + ) + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.v_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_v.weight", + "transpose", + ) + ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( + f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.weight", + "transpose", + ) ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.out_proj.weight"] = ( - f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.weight", - "transpose", ) - ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.bias" + f"encoder.layers.{i}.self_attn.out_proj.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i}.1.to_out.bias" ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm2.weight"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.weight" + f"encoder.layers.{i}.norm2.weight" + ] = 
f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.weight" ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.norm2.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.bias" + f"encoder.layers.{i}.norm2.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.0.bias" ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear1.weight"] = ( f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight", - "transpose", ) + "transpose", + ) ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.linear1.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + f"encoder.layers.{i}.linear1.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" ppdiffusers_mapping_to_orig[f"encoder.layers.{i}.linear2.weight"] = ( f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight", - "transpose", ) + "transpose", + ) ppdiffusers_mapping_to_orig[ - f"encoder.layers.{i}.linear2.bias"] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" + f"encoder.layers.{i}.linear2.bias" + ] = f"cond_stage_model.transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" - ppdiffusers_mapping_to_orig[ - "final_layer_norm.weight"] = "cond_stage_model.transformer.norm.weight" - ppdiffusers_mapping_to_orig[ - "final_layer_norm.bias"] = "cond_stage_model.transformer.norm.bias" + ppdiffusers_mapping_to_orig["final_layer_norm.weight"] = "cond_stage_model.transformer.norm.weight" + ppdiffusers_mapping_to_orig["final_layer_norm.bias"] = "cond_stage_model.transformer.norm.bias" new_state_dict = {} for k, v in ldmbert_state_dict.items(): @@ -286,18 +287,15 @@ def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32): if isinstance(new_name, (list, tuple)): need_transpose = True new_name = new_name[0] - new_state_dict[new_name] = (torch.from_numpy(v.t().numpy()) - if need_transpose else - torch.from_numpy(v.numpy())) + new_state_dict[new_name] = torch.from_numpy(v.t().numpy()) if need_transpose else torch.from_numpy(v.numpy()) # dummpy weights, we donot use this! 
- new_state_dict[ - "cond_stage_model.transformer.to_logits.weight"] = torch.zeros( - new_state_dict[ - "cond_stage_model.transformer.token_emb.weight"].shape) + new_state_dict["cond_stage_model.transformer.to_logits.weight"] = torch.zeros( + new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape + ) new_state_dict["cond_stage_model.transformer.to_logits.bias"] = torch.zeros( - new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape[ - 0]) + new_state_dict["cond_stage_model.transformer.token_emb.weight"].shape[0] + ) return new_state_dict @@ -308,43 +306,35 @@ def convert_ldmbert_state_dict(ldmbert_state_dict, num_layers=32): default=None, type=str, required=True, - help="Path to the model to convert.", ) + help="Path to the model to convert.", + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) - parser.add_argument( - "--half", action="store_true", help="Save weights in half precision.") + help="Path to the output model.", + ) + parser.add_argument("--half", action="store_true", help="Save weights in half precision.") args = parser.parse_args() pipe = LDMTextToImagePipeline.from_pretrained(args.model_name_or_path) # Convert the UNet model - unet_state_dict = convert_ppdiffusers_vae_unet_to_diffusers( - pipe.unet, pipe.unet.state_dict()) + unet_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.unet, pipe.unet.state_dict()) unet_state_dict = convert_unet_state_dict(unet_state_dict) - unet_state_dict = { - "model.diffusion_model." + k: v - for k, v in unet_state_dict.items() - } + unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()} # Convert the VAE model - vae_state_dict = convert_ppdiffusers_vae_unet_to_diffusers( - pipe.vqvae, pipe.vqvae.state_dict()) + vae_state_dict = convert_ppdiffusers_vae_unet_to_diffusers(pipe.vqvae, pipe.vqvae.state_dict()) vae_state_dict = convert_vae_state_dict(vae_state_dict) - vae_state_dict = { - "first_stage_model." + k: v - for k, v in vae_state_dict.items() - } + vae_state_dict = {"first_stage_model." 
+ k: v for k, v in vae_state_dict.items()} # Convert the ldmbert model - text_enc_dict = convert_ldmbert_state_dict( - pipe.bert.state_dict(), num_layers=pipe.bert.config["encoder_layers"]) + text_enc_dict = convert_ldmbert_state_dict(pipe.bert.state_dict(), num_layers=pipe.bert.config["encoder_layers"]) # Put together new checkpoint - state_dict = { ** unet_state_dict, ** vae_state_dict, ** text_enc_dict} + state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict} if args.half: state_dict = {k: v.half() for k, v in state_dict.items()} state_dict = {"state_dict": state_dict} diff --git a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py b/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py index 3ab76ea0ffc2b..6890fae514ab5 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py +++ b/ppdiffusers/examples/text_to_image_laion400m/scripts/plot_fid_clip_score.py @@ -41,7 +41,8 @@ linewidth=3, color="r", marker="o", - markerfacecolor="blue", ) + markerfacecolor="blue", +) plt.plot( clip_pt, fid_pt, @@ -49,7 +50,8 @@ linewidth=3, color="b", marker="o", - markerfacecolor="red", ) + markerfacecolor="red", +) plt.xlabel("CLIP Score") plt.ylabel("FID@1k") plt.title("12W Globel Step Pareto Curves - DDIM") diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py index 15352e4cd1d5b..4aa3163536c16 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py +++ b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_no_trainer.py @@ -21,8 +21,14 @@ import paddle import paddle.nn as nn -from ldm import (DataArguments, LatentDiffusionModel, ModelArguments, - NoTrainerTrainingArguments, TextImagePair, worker_init_fn) +from ldm import ( + DataArguments, + LatentDiffusionModel, + ModelArguments, + NoTrainerTrainingArguments, + TextImagePair, + worker_init_fn, +) from paddle.io import DataLoader from paddle.optimizer import AdamW from paddlenlp.trainer import PdArgumentParser, set_seed @@ -47,12 +53,11 @@ def get_writer(training_args): def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, NoTrainerTrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, NoTrainerTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() training_args.image_logging_steps = model_args.image_logging_steps = ( - math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) + math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps + ) training_args.resolution = data_args.resolution training_args.print_config(training_args, "Training") training_args.print_config(model_args, "Model") @@ -64,8 +69,7 @@ def main(): if num_processes > 1: paddle.distributed.init_parallel_env() - training_args.logging_dir = os.path.join(training_args.output_dir, - training_args.logging_dir) + training_args.logging_dir = os.path.join(training_args.output_dir, training_args.logging_dir) if training_args.seed is not None: set_seed(training_args.seed) @@ -75,16 +79,14 @@ def main(): model = LatentDiffusionModel(model_args) model.set_recompute(training_args.recompute) - params_to_train = itertools.chain(model.text_encoder.parameters(), - model.unet.parameters()) + params_to_train = 
itertools.chain(model.text_encoder.parameters(), model.unet.parameters()) lr_scheduler = get_scheduler( training_args.lr_scheduler_type, learning_rate=training_args.learning_rate, - num_warmup_steps=training_args.warmup_steps * - training_args.gradient_accumulation_steps, - num_training_steps=training_args.max_steps * - training_args.gradient_accumulation_steps, ) + num_warmup_steps=training_args.warmup_steps * training_args.gradient_accumulation_steps, + num_training_steps=training_args.max_steps * training_args.gradient_accumulation_steps, + ) optimizer = AdamW( learning_rate=lr_scheduler, @@ -94,8 +96,9 @@ def main(): weight_decay=training_args.weight_decay, epsilon=training_args.adam_epsilon, grad_clip=nn.ClipGradByGlobalNorm(training_args.max_grad_norm) - if training_args.max_grad_norm is not None and - training_args.max_grad_norm > 0 else None, ) + if training_args.max_grad_norm is not None and training_args.max_grad_norm > 0 + else None, + ) train_dataset = TextImagePair( file_list=data_args.file_list, size=data_args.resolution, @@ -103,7 +106,8 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation="lanczos", - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) if num_processes > 1: model = paddle.DataParallel(model) @@ -112,28 +116,23 @@ def main(): train_dataset, batch_size=training_args.per_device_train_batch_size, num_workers=training_args.dataloader_num_workers, - worker_init_fn=worker_init_fn, ) + worker_init_fn=worker_init_fn, + ) if rank == 0: writer = get_writer(training_args) # Train! - total_batch_size = (training_args.per_device_train_batch_size * - num_processes * - training_args.gradient_accumulation_steps) + total_batch_size = ( + training_args.per_device_train_batch_size * num_processes * training_args.gradient_accumulation_steps + ) logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" - ) - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {training_args.gradient_accumulation_steps}" - ) + logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {training_args.gradient_accumulation_steps}") global_steps = 0 tic_train = time.time() @@ -144,15 +143,13 @@ def main(): break for step, batch in enumerate(train_dataloader): - if (num_processes > 1 and ( - (step + 1) % training_args.gradient_accumulation_steps != 0) - ) or training_args.recompute: + if ( + num_processes > 1 and ((step + 1) % training_args.gradient_accumulation_steps != 0) + ) or training_args.recompute: # grad acc, no_sync when (step + 1) % training_args.gradient_accumulation_steps != 0: ctx_manager = model.no_sync() else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() with ctx_manager: loss = model(**batch) @@ -170,8 +167,7 @@ def main(): # train log if global_steps % training_args.logging_steps == 0: logs = { - "train/loss": - loss.item() * training_args.gradient_accumulation_steps, + "train/loss": loss.item() * training_args.gradient_accumulation_steps, "train/lr_abs": lr_scheduler.get_lr(), "train/global_steps": global_steps, } @@ -191,48 +187,51 @@ def main(): logger.info(log_str) if global_steps % training_args.image_logging_steps == 0: - reconstruction_img = unwrap_model(model).decode_image( - pixel_values=batch["pixel_values"]) - ddim_10_img = unwrap_model(model).log_image( - input_ids=batch["input_ids"], guidance_scale=1.0) - ddim_75_img = unwrap_model(model).log_image( - input_ids=batch["input_ids"], guidance_scale=7.5) + reconstruction_img = unwrap_model(model).decode_image(pixel_values=batch["pixel_values"]) + ddim_10_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=1.0) + ddim_75_img = unwrap_model(model).log_image(input_ids=batch["input_ids"], guidance_scale=7.5) if rank == 0: writer.add_image( "reconstruction", reconstruction_img, global_steps, - dataformats="NHWC", ) + dataformats="NHWC", + ) writer.add_image( "ddim-samples-1.0", ddim_10_img, global_steps, - dataformats="NHWC", ) + dataformats="NHWC", + ) writer.add_image( "ddim-samples-7.5", ddim_75_img, global_steps, - dataformats="NHWC", ) + dataformats="NHWC", + ) tic_train = time.time() if rank == 0 and global_steps % training_args.save_steps == 0: os.makedirs( - os.path.join(training_args.output_dir, - f"global-steps-{global_steps}"), - exist_ok=True, ) + os.path.join(training_args.output_dir, f"global-steps-{global_steps}"), + exist_ok=True, + ) paddle.save( model.state_dict(), os.path.join( training_args.output_dir, f"global-steps-{global_steps}", - "model_state.pdparams", ), ) + "model_state.pdparams", + ), + ) if global_steps >= training_args.max_steps: break if rank == 0: paddle.save( model.state_dict(), - os.path.join(training_args.output_dir, "model_state.pdparams"), ) + os.path.join(training_args.output_dir, "model_state.pdparams"), + ) writer.close() diff --git a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py index d0464a661998f..0125d6fc27e9d 100644 --- a/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py +++ b/ppdiffusers/examples/text_to_image_laion400m/train_txt2img_laion400m_trainer.py @@ -16,16 +16,19 @@ import os import paddle -from ldm import (DataArguments, LatentDiffusionModel, LatentDiffusionTrainer, - ModelArguments, TextImagePair) -from 
paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from ldm import ( + DataArguments, + LatentDiffusionModel, + LatentDiffusionTrainer, + ModelArguments, + TextImagePair, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger def main(): - parser = PdArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] @@ -33,9 +36,10 @@ def main(): training_args.benchmark = model_args.benchmark training_args.profiler_options = model_args.profiler_options training_args.image_logging_steps = model_args.image_logging_steps = ( - (math.ceil(model_args.image_logging_steps / training_args.logging_steps) - * training_args.logging_steps) - if model_args.image_logging_steps > 0 else -1) + (math.ceil(model_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps) + if model_args.image_logging_steps > 0 + else -1 + ) training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") @@ -44,16 +48,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
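Both training entry points above round image_logging_steps up to the nearest multiple of logging_steps before training starts, so image logging always lands on a scalar-logging step. A minimal standalone sketch of that rounding, with illustrative values only (not lines from this patch):

import math

logging_steps = 50          # how often scalar logs are written
image_logging_steps = 120   # requested image-logging interval

# round up to the next multiple of logging_steps: ceil(120 / 50) * 50 == 150
image_logging_steps = math.ceil(image_logging_steps / logging_steps) * logging_steps
print(image_logging_steps)  # 150

The trainer-based script additionally maps a non-positive image_logging_steps to -1, which disables image logging entirely.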
@@ -67,32 +69,30 @@ def main(): buffer_size=data_args.buffer_size, shuffle_every_n_samples=data_args.shuffle_every_n_samples, interpolation="lanczos", - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) if model_args.to_static: - input_ids = paddle.static.InputSpec( - name="input_ids", - shape=[-1, model_args.model_max_length], - dtype="int64") + input_ids = paddle.static.InputSpec(name="input_ids", shape=[-1, model_args.model_max_length], dtype="int64") pixel_values = paddle.static.InputSpec( name="pixel_values", shape=[-1, 3, data_args.resolution, data_args.resolution], - dtype="float32", ) + dtype="float32", + ) specs = [input_ids, pixel_values] paddle.jit.ignore_module([os]) model = paddle.jit.to_static(model, input_spec=specs) - logger.info("Successfully to apply @to_static with specs: {}".format( - specs)) + logger.info("Successfully to apply @to_static with specs: {}".format(specs)) trainer = LatentDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) - params_to_train = itertools.chain(trainer.model.text_encoder.parameters(), - trainer.model.unet.parameters()) + params_to_train = itertools.chain(trainer.model.text_encoder.parameters(), trainer.model.unet.parameters()) trainer.set_optimizer_grouped_parameters(params_to_train) checkpoint = None diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py index e94c83d4ee0af..14468dc73417a 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_functional_video.py @@ -30,16 +30,13 @@ def crop(clip, i, j, h, w): """ if len(clip.shape) != 4: raise ValueError("clip should be a 4D tensor") - return clip[(...), i:i + h, j:j + w] + return clip[(...), i : i + h, j : j + w] def resize(clip, target_size, interpolation_mode): if len(target_size) != 2: - raise ValueError( - f"target size should be tuple (height, width), instead got {target_size}" - ) - return paddle.nn.functional.interpolate( - x=clip, size=target_size, mode=interpolation_mode, align_corners=False) + raise ValueError(f"target size should be tuple (height, width), instead got {target_size}") + return paddle.nn.functional.interpolate(x=clip, size=target_size, mode=interpolation_mode, align_corners=False) def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): @@ -85,8 +82,7 @@ def to_tensor(clip): """ _is_tensor_video_clip(clip) if not clip.dtype == "uint8": - raise TypeError("clip tensor should have data type uint8. Got %s" % - str(clip.dtype)) + raise TypeError("clip tensor should have data type uint8. 
Got %s" % str(clip.dtype)) return clip.astype(dtype="float32").transpose(perm=[3, 0, 1, 2]) / 255.0 @@ -105,8 +101,7 @@ def normalize(clip, mean, std, inplace=False): clip = clip.clone() mean = paddle.to_tensor(data=mean, place=clip.place).astype(clip.dtype) std = paddle.to_tensor(data=std, place=clip.place).astype(clip.dtype) - clip = clip.substract(mean[:, (None), (None), (None)]).divide(std[:, ( - None), (None), (None)]) + clip = clip.substract(mean[:, (None), (None), (None)]).divide(std[:, (None), (None), (None)]) return clip diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py index aaaa301718d58..97b39c8cf8f86 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/_transforms_video.py @@ -44,15 +44,15 @@ def __repr__(self) -> str: class RandomResizedCropVideo(paddle.vision.transforms.RandomResizedCrop): def __init__( - self, - size, - scale=(0.08, 1.0), - ratio=(3.0 / 4.0, 4.0 / 3.0), - interpolation_mode="bilinear", ): + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation_mode="bilinear", + ): if isinstance(size, tuple): if len(size) != 2: - raise ValueError( - f"size should be tuple (height, width), instead got {size}") + raise ValueError(f"size should be tuple (height, width), instead got {size}") self.size = size else: self.size = size, size @@ -69,8 +69,7 @@ def __call__(self, clip): size is (C, T, H, W) """ i, j, h, w = self.get_params(clip, self.scale, self.ratio) - return F.resized_crop(clip, i, j, h, w, self.size, - self.interpolation_mode) + return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) def __repr__(self) -> str: return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}, scale={self.scale}, ratio={self.ratio})" diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py index c40a946bb1047..e2e940e51fc97 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/frame_dataset.py @@ -21,6 +21,7 @@ from PIL import Image, ImageFile from ._transforms_video import CenterCropVideo, RandomCropVideo + """ VideoFrameDataset """ ImageFile.LOAD_TRUNCATED_IMAGES = True IMG_EXTENSIONS = [ @@ -72,9 +73,7 @@ def is_image_file(filename): def find_classes(dir): assert os.path.exists(dir), f"{dir} does not exist" - classes = [ - d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)) - ] + classes = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))] classes.sort() class_to_idx = {classes[i]: i for i in range(len(classes))} return classes, class_to_idx @@ -87,10 +86,7 @@ def class_name_to_idx(annotation_dir): fpath = os.path.join(annotation_dir, "classInd.txt") with open(fpath, "r") as f: data = f.readlines() - class_to_idx = { - x.strip().split(" ")[1].lower(): int(x.strip().split(" ")[0]) - 1 - for x in data - } + class_to_idx = {x.strip().split(" ")[1].lower(): int(x.strip().split(" ")[0]) - 1 for x in data} return class_to_idx @@ -151,8 +147,7 @@ def split_by_captical(s): return string.rstrip(" ").lower() -def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, - clip_step=None): +def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, clip_step=None): """ Load consecutive clips and consecutive frames from `dir`. 
@@ -181,11 +176,9 @@ def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, assert os.path.isdir(video_path) frames = [] for i, fname in enumerate(sorted(os.listdir(video_path))): - assert is_image_file( - fname), f"fname={fname},video_path={video_path},dir={dir}" + assert is_image_file(fname), f"fname={fname},video_path={video_path},dir={dir}" img_path = os.path.join(video_path, fname) - class_name = video_name.split("_")[ - 1].lower() # v_BoxingSpeedBag_g12_c05 -> boxingspeedbag + class_name = video_name.split("_")[1].lower() # v_BoxingSpeedBag_g12_c05 -> boxingspeedbag class_caption = split_by_captical( video_name.split("_")[1] ) # v_BoxingSpeedBag_g12_c05 -> BoxingSpeedBag -> boxing speed bag @@ -201,7 +194,7 @@ def make_dataset_ucf(dir, nframes, class_to_idx, frame_stride=1, frames = frames[::frame_stride] start_indices = list(range(len(frames)))[::clip_step] for i in start_indices: - clip = frames[i:i + nframes] + clip = frames[i : i + nframes] if len(clip) == nframes: clips.append(clip) return clips, videos @@ -234,18 +227,19 @@ def load_and_transform_frames(frame_list, loader, img_transform=None): class VideoFrameDataset(paddle.io.Dataset): def __init__( - self, - data_root, - resolution, - video_length, - dataset_name="", - subset_split="", - annotation_dir=None, - spatial_transform="", - temporal_transform="", - frame_stride=1, - clip_step=None, - tokenizer=None, ): + self, + data_root, + resolution, + video_length, + dataset_name="", + subset_split="", + annotation_dir=None, + spatial_transform="", + temporal_transform="", + frame_stride=1, + clip_step=None, + tokenizer=None, + ): self.loader = default_loader self.video_length = video_length self.subset_split = subset_split @@ -264,8 +258,7 @@ def __init__( if annotation_dir is None: annotation_dir = os.path.join(data_root, "ucfTrainTestlist") class_to_idx = class_name_to_idx(annotation_dir) - assert (len(class_to_idx) == 101 - ), f"num of classes = {len(class_to_idx)}, not 101" + assert len(class_to_idx) == 101, f"num of classes = {len(class_to_idx)}, not 101" elif dataset_name == "sky": classes, class_to_idx = find_classes(video_dir) else: @@ -279,9 +272,9 @@ def __init__( video_length, class_to_idx, frame_stride=frame_stride, - clip_step=clip_step, ) - assert (len(self.clips[0]) == video_length - ), f"Invalid clip length = {len(self.clips[0])}" + clip_step=clip_step, + ) + assert len(self.clips[0]) == video_length, f"Invalid clip length = {len(self.clips[0])}" if self.temporal_transform == "rand_clips": self.clips = self.videos if subset_split == "all": @@ -296,31 +289,33 @@ def __init__( print("[VideoFrameDataset] video_length", self.video_length) if len(self.clips) == 0: raise RuntimeError( - f"Found 0 clips in {video_dir}. \nSupported image extensions are: " - + ",".join(IMG_EXTENSIONS)) - self.img_transform = paddle.vision.transforms.Compose([ - paddle.vision.transforms.ToTensor(), - paddle.vision.transforms.Normalize((0.5, 0.5, 0.5), - (0.5, 0.5, 0.5)), - ]) + f"Found 0 clips in {video_dir}. 
\nSupported image extensions are: " + ",".join(IMG_EXTENSIONS) + ) + self.img_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] + ) if self.spatial_transform == "center_crop_resize": print("Spatial transform: center crop and then resize") - self.video_transform = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - CenterCropVideo(resolution), - ]) - self.video_transform_step1 = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - ]) - self.video_transform_step2 = paddle.vision.transforms.Compose( - [CenterCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + CenterCropVideo(resolution), + ] + ) + self.video_transform_step1 = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + ] + ) + self.video_transform_step2 = paddle.vision.transforms.Compose([CenterCropVideo(resolution)]) elif self.spatial_transform == "resize": print("Spatial transform: resize with no crop") - self.video_transform = paddle.vision.transforms.Resize( - (resolution, resolution)) + self.video_transform = paddle.vision.transforms.Resize((resolution, resolution)) elif self.spatial_transform == "random_crop": - self.video_transform = paddle.vision.transforms.Compose( - [RandomCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose([RandomCropVideo(resolution)]) elif self.spatial_transform == "": self.video_transform = None else: @@ -332,7 +327,8 @@ def __init__( padding="max_length", truncation=True, max_length=tokenizer.model_max_length, - return_tensors="np", ).input_ids[0] + return_tensors="np", + ).input_ids[0] else: self.text_processing = None @@ -340,14 +336,13 @@ def __getitem__(self, index): if self.temporal_transform == "rand_clips": raw_video = self.clips[index] rand_idx = random.randint(0, len(raw_video) - self.video_length) - clip = raw_video[rand_idx:rand_idx + self.video_length] + clip = raw_video[rand_idx : rand_idx + self.video_length] else: clip = self.clips[index] assert ( len(clip) == self.video_length ), f"current clip_length={len(clip)}, target clip_length={self.video_length}, {clip}" - frames, labels = load_and_transform_frames(clip, self.loader, - self.img_transform) + frames, labels = load_and_transform_frames(clip, self.loader, self.img_transform) assert ( len(frames) == self.video_length @@ -357,8 +352,7 @@ def __getitem__(self, index): if self.spatial_transform == "center_crop_resize": temp_frames = rearrange(frames, "c t h w -> (c t) h w") temp_frames = self.video_transform_step1(temp_frames) - frames = rearrange( - temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) + frames = rearrange(temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) frames = self.video_transform_step2(frames) else: frames = self.video_transform(frames) @@ -377,7 +371,9 @@ def __getitem__(self, index): "input_ids": self.text_processing(example["caption"]), } else: - tensor_out = {"pixel_values": example["image"], } + tensor_out = { + "pixel_values": example["image"], + } return tensor_out def __len__(self): diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py index a4aefa02a1008..e91a6f6018c21 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py +++ 
b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_short.py @@ -25,100 +25,82 @@ class ModelArguments: # for initialization task_type: str = field( default="short", - metadata={ - "help": - "Type of train task. Should be one of ['short', 'text2video']" - }, ) + metadata={"help": "Type of train task. Should be one of ['short', 'text2video']"}, + ) pretrained_model_name_or_path: str = field( default=None, - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) tokenizer_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not use pretrained model name or path"}, + ) vae_type: str = field( default="3d", metadata={"help": "Type of vae to use. Should be one of ['2d', '3d']"}, ) vae_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained vae name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained vae name or path if not use pretrained model name or path"}, + ) text_encoder_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained text encoder name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained text encoder name or path if not use pretrained model name or path"}, + ) text_encoder_config_file: Optional[str] = field( default=None, - metadata={ - "help": - "Text encoder config file if not use pretrained text encoder" - }, ) - is_text_encoder_trainable: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Text encoder config file if not use pretrained text encoder"}, + ) + is_text_encoder_trainable: bool = field(default=False, metadata={"help": "Whether or not use ema"}) unet_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained unet name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained unet name or path if not use pretrained model name or path"}, + ) unet_config_file: Optional[str] = field( - default=None, - metadata={"help": "Unet config file if not use pretrained unet"}) + default=None, metadata={"help": "Unet config file if not use pretrained unet"} + ) scheduler_beta_start: Optional[float] = field( - default=0.0015, - metadata={"help": "Train or eval scheduler beta start"}) - scheduler_beta_end: Optional[float] = field( - default=0.0155, metadata={"help": "Train or eval scheduler beta end"}) + default=0.0015, metadata={"help": "Train or eval scheduler beta start"} + ) + scheduler_beta_end: Optional[float] = field(default=0.0155, metadata={"help": "Train or eval scheduler beta end"}) scheduler_num_train_timesteps: Optional[int] = field( default=1000, metadata={"help": "Train or eval scheduler number of train timesteps"}, ) eval_scheduler_num_inference_steps: Optional[int] = field( - default=50, - metadata={"help": "Eval scheduler number of inference timesteps"}) + default=50, metadata={"help": "Eval scheduler number of inference timesteps"} + ) # for training - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable 
xformers memory efficient attention"}) + default=False, metadata={"help": "enable xformers memory efficient attention"} + ) scale_factor: Optional[float] = field( default=0.33422927, - metadata={"help": "The scale factor in the first stage encoding"}, ) + metadata={"help": "The scale factor in the first stage encoding"}, + ) shift_factor: Optional[float] = field( default=1.4606637, - metadata={"help": "The shift factor in the first stage encoding"}, ) + metadata={"help": "The shift factor in the first stage encoding"}, + ) loss_type: str = field( default="l1", - metadata={ - "help": - "The loss type to use in training. Should be one of ['l2', 'l1']" - }, ) + metadata={"help": "The loss type to use in training. Should be one of ['l2', 'l1']"}, + ) # for alignmemnt latents_path: str = field( default=None, - metadata={"help": "Path to latents, used for alignment"}, ) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init"}) + metadata={"help": "Path to latents, used for alignment"}, + ) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init"}) if_numpy_genarator_random_alignment: bool = field( default=False, - metadata={"help": "Whether to align random using numpy generator"}, ) + metadata={"help": "Whether to align random using numpy generator"}, + ) numpy_genarator_random_seed: Optional[int] = field( - default=42, metadata={"help": "The random seed for numpy generator"}) - set_seed_for_alignment: bool = field( - default=False, - metadata={"help": "Whether to set seed again for alignment"}) + default=42, metadata={"help": "The random seed for numpy generator"} + ) + set_seed_for_alignment: bool = field(default=False, metadata={"help": "Whether to set seed again for alignment"}) @dataclass @@ -128,8 +110,7 @@ class TrainerArguments: """ # for log - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) @dataclass @@ -140,28 +121,29 @@ class VideoFrameDatasetArguments: train_data_root: str = field( default="/root/data/lvdm/sky", - metadata={"help": "The root path of train dataset files"}, ) - train_subset_split: str = field( - default="train", metadata={"help": "The train subset split"}) + metadata={"help": "The root path of train dataset files"}, + ) + train_subset_split: str = field(default="train", metadata={"help": "The train subset split"}) eval_data_root: str = field( default="/root/data/lvdm/sky", - metadata={"help": "The root path of validation dataset files"}, ) - eval_subset_split: str = field( - default="train", metadata={"help": "The validation subset split"}) + metadata={"help": "The root path of validation dataset files"}, + ) + eval_subset_split: str = field(default="train", metadata={"help": "The validation subset split"}) resolution: int = field( default=256, - metadata={"help": "The resolution"}, ) + metadata={"help": "The resolution"}, + ) video_length: int = field( default=16, - metadata={"help": "The video length"}, ) - dataset_name: str = field( - default="sky", metadata={"help": "The dataset name"}) + metadata={"help": "The video length"}, + ) + dataset_name: str = field(default="sky", metadata={"help": "The dataset name"}) spatial_transform: str = field( default="center_crop_resize", - metadata={"help": "The spatial transform type to use"}, ) - temporal_transform: str = field( - default="rand_clips", - 
metadata={"help": "The temporal transform type to use"}) + metadata={"help": "The spatial transform type to use"}, + ) + temporal_transform: str = field(default="rand_clips", metadata={"help": "The temporal transform type to use"}) clip_step: int = field( default=None, - metadata={"help": "The clip step"}, ) + metadata={"help": "The clip step"}, + ) diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py index feb46a5f5e3ad..39000183c6cce 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_args_text2video.py @@ -25,100 +25,79 @@ class ModelArguments: # for initialization task_type: str = field( default="text2video", - metadata={ - "help": - "Type of train task. Should be one of ['short', 'text2video']" - }, ) + metadata={"help": "Type of train task. Should be one of ['short', 'text2video']"}, + ) pretrained_model_name_or_path: str = field( default=None, - metadata={ - "help": - "Path to pretrained model or model, when we want to resume training." - }, ) + metadata={"help": "Path to pretrained model or model, when we want to resume training."}, + ) tokenizer_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained tokenizer name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained tokenizer name or path if not use pretrained model name or path"}, + ) vae_type: str = field( default="2d", metadata={"help": "Type of vae to use. Should be one of ['2d', '3d']"}, ) vae_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained vae name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained vae name or path if not use pretrained model name or path"}, + ) text_encoder_name_or_path: Optional[str] = field( default="openai/clip-vit-large-patch14", - metadata={ - "help": - "Pretrained text encoder name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained text encoder name or path if not use pretrained model name or path"}, + ) text_encoder_config_file: Optional[str] = field( default=None, - metadata={ - "help": - "Text encoder config file if not use pretrained text encoder" - }, ) - is_text_encoder_trainable: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + metadata={"help": "Text encoder config file if not use pretrained text encoder"}, + ) + is_text_encoder_trainable: bool = field(default=False, metadata={"help": "Whether or not use ema"}) unet_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": - "Pretrained unet name or path if not use pretrained model name or path" - }, ) + metadata={"help": "Pretrained unet name or path if not use pretrained model name or path"}, + ) unet_config_file: Optional[str] = field( - default=None, - metadata={"help": "Unet config file if not use pretrained unet"}) + default=None, metadata={"help": "Unet config file if not use pretrained unet"} + ) scheduler_beta_start: Optional[float] = field( - default=0.00085, - metadata={"help": "Train or eval scheduler beta start"}) - scheduler_beta_end: Optional[float] = field( - default=0.012, metadata={"help": "Train or eval scheduler beta end"}) + default=0.00085, metadata={"help": "Train or eval scheduler beta start"} + ) + scheduler_beta_end: Optional[float] = field(default=0.012, metadata={"help": "Train or eval scheduler beta end"}) 
scheduler_num_train_timesteps: Optional[int] = field( default=1000, metadata={"help": "Train or eval scheduler number of train timesteps"}, ) eval_scheduler_num_inference_steps: Optional[int] = field( - default=50, - metadata={"help": "Eval scheduler number of inference timesteps"}) + default=50, metadata={"help": "Eval scheduler number of inference timesteps"} + ) # for training - use_ema: bool = field( - default=False, metadata={"help": "Whether or not use ema"}) + use_ema: bool = field(default=False, metadata={"help": "Whether or not use ema"}) enable_xformers_memory_efficient_attention: bool = field( - default=False, - metadata={"help": "enable xformers memory efficient attention"}) + default=False, metadata={"help": "enable xformers memory efficient attention"} + ) scale_factor: Optional[float] = field( default=0.18215, - metadata={"help": "The scale factor in the first stage encoding"}, ) - shift_factor: Optional[float] = field( - default=0, - metadata={"help": "The shift factor in the first stage encoding"}) + metadata={"help": "The scale factor in the first stage encoding"}, + ) + shift_factor: Optional[float] = field(default=0, metadata={"help": "The shift factor in the first stage encoding"}) loss_type: str = field( default="l2", - metadata={ - "help": - "The loss type to use in training. Should be one of ['l2', 'l1']" - }, ) + metadata={"help": "The loss type to use in training. Should be one of ['l2', 'l1']"}, + ) # for alignmemnt latents_path: str = field( default=None, - metadata={"help": "Path to latents, used for alignment"}, ) - use_paddle_conv_init: bool = field( - default=False, - metadata={"help": "Whether or not use paddle conv2d init"}) + metadata={"help": "Path to latents, used for alignment"}, + ) + use_paddle_conv_init: bool = field(default=False, metadata={"help": "Whether or not use paddle conv2d init"}) if_numpy_genarator_random_alignment: bool = field( default=False, - metadata={"help": "Whether to align random using numpy generator"}, ) + metadata={"help": "Whether to align random using numpy generator"}, + ) numpy_genarator_random_seed: Optional[int] = field( - default=42, metadata={"help": "The random seed for numpy generator"}) - set_seed_for_alignment: bool = field( - default=False, - metadata={"help": "Whether to set seed again for alignment"}) + default=42, metadata={"help": "The random seed for numpy generator"} + ) + set_seed_for_alignment: bool = field(default=False, metadata={"help": "Whether to set seed again for alignment"}) @dataclass @@ -128,8 +107,7 @@ class TrainerArguments: """ # for log - image_logging_steps: Optional[int] = field( - default=1000, metadata={"help": "Log image every X steps."}) + image_logging_steps: Optional[int] = field(default=1000, metadata={"help": "Log image every X steps."}) @dataclass @@ -140,27 +118,34 @@ class WebVidDatasetArguments: train_data_root: str = field( default="/root/data/lvdm/webvid/share_datasets", - metadata={"help": "The root path of train dataset files"}, ) + metadata={"help": "The root path of train dataset files"}, + ) train_annotation_path: str = field( default="/root/data/lvdm/webvid/share_datasets/train_type_data.list", - metadata={"help": "The root path of train annotation"}, ) - train_subset_split: str = field( - default="all", metadata={"help": "The train subset split"}) + metadata={"help": "The root path of train annotation"}, + ) + train_subset_split: str = field(default="all", metadata={"help": "The train subset split"}) eval_data_root: str = field( 
default="/root/data/lvdm/webvid/share_datasets", - metadata={"help": "The root path of validation dataset files"}, ) + metadata={"help": "The root path of validation dataset files"}, + ) eval_annotation_path: str = field( default="/root/data/lvdm/webvid/share_datasets/val_type_data.list", - metadata={"help": "The root path of validation annotation"}, ) - eval_subset_split: str = field( - default="all", metadata={"help": "The validation subset split"}) + metadata={"help": "The root path of validation annotation"}, + ) + eval_subset_split: str = field(default="all", metadata={"help": "The validation subset split"}) resolution: int = field( default=256, - metadata={"help": "The resolution"}, ) + metadata={"help": "The resolution"}, + ) video_length: int = field( default=16, - metadata={"help": "The video length"}, ) - frame_stride: int = field(default=4, ) + metadata={"help": "The video length"}, + ) + frame_stride: int = field( + default=4, + ) spatial_transform: str = field( default="center_crop_resize", - metadata={"help": "The spatial transform type to use"}, ) + metadata={"help": "The spatial transform type to use"}, + ) diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py index 9b00773644bbb..a087314494b33 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_model.py @@ -21,23 +21,34 @@ import numpy as np import paddle import paddle.nn as nn -from einops import rearrange, repeat +from einops import rearrange from paddlenlp.transformers import AutoTokenizer, CLIPTextModel from paddlenlp.utils.log import logger -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LVDMAutoencoderKL, LVDMUNet3DModel, - is_ppxformers_available) -from ppdiffusers.initializer import (normal_, reset_initialized_parameter, - xavier_uniform_, zeros_) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LVDMAutoencoderKL, + LVDMUNet3DModel, + is_ppxformers_available, +) +from ppdiffusers.initializer import ( + normal_, + reset_initialized_parameter, + xavier_uniform_, + zeros_, +) from ppdiffusers.models.ema import LitEma -from ppdiffusers.models.lvdm_attention_temporal import (RelativePosition, - TemporalCrossAttention) +from ppdiffusers.models.lvdm_attention_temporal import ( + RelativePosition, + TemporalCrossAttention, +) from ppdiffusers.models.lvdm_distributions import DiagonalGaussianDistribution from ppdiffusers.training_utils import freeze_params -def set_seed(seed: int=1234, args=None): +def set_seed(seed: int = 1234, args=None): if args is None: random.seed(seed) np.random.seed(seed) @@ -45,16 +56,14 @@ def set_seed(seed: int=1234, args=None): if args is not None: if args.use_hybrid_parallel: - from paddle.distributed.fleet.meta_parallel import \ - get_rng_state_tracker + from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker random.seed(args.seed + args.dataset_rank) np.random.seed(args.seed + args.dataset_rank) paddle.seed(args.seed + args.dataset_rank) # local_seed/ global_seed is used to control dropout in ModelParallel - local_seed = (args.seed + 59999 + args.tensor_parallel_rank * 10 + - args.pipeline_parallel_rank * 1000) + local_seed = args.seed + 59999 + args.tensor_parallel_rank * 10 + args.pipeline_parallel_rank * 1000 global_seed = args.seed + 100003 + args.dataset_rank tracker = get_rng_state_tracker() @@ -78,12 +87,10 @@ def split_video_to_clips(video, clip_length, 
drop_left=True): video_length = video.shape[2] shape = video.shape if video_length % clip_length != 0 and drop_left: - video = video[:, :, :video_length // clip_length * clip_length, :, :] - print( - f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") + video = video[:, :, : video_length // clip_length * clip_length, :, :] + print(f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") nclips = video_length // clip_length - clips = rearrange( - video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) + clips = rearrange(video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) return clips @@ -104,17 +111,17 @@ def __init__(self, model_args): if model_args.task_type == "text2video": tokenizer_name_or_path = ( model_args.tokenizer_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "tokenizer")) - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_name_or_path) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "tokenizer") + ) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) # init vae vae_name_or_path = ( model_args.vae_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "vae")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "vae") + ) self.vae_type = model_args.vae_type self.encoder_type = model_args.vae_type if model_args.vae_type == "2d": @@ -122,7 +129,7 @@ def __init__(self, model_args): elif model_args.vae_type == "3d": self.vae = LVDMAutoencoderKL.from_pretrained(vae_name_or_path) else: - raise ValueError(f"`vae_type` to be `2d` or `3d`.") + raise ValueError("`vae_type` to be `2d` or `3d`.") freeze_params(self.vae.parameters()) logger.info("Freeze vae parameters!") @@ -130,16 +137,14 @@ def __init__(self, model_args): if model_args.task_type == "text2video": text_encoder_name_or_path = ( model_args.text_encoder_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, - "text_encoder")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "text_encoder") + ) self.text_encoder_is_pretrained = text_encoder_name_or_path is not None if self.text_encoder_is_pretrained: - self.text_encoder = CLIPTextModel.from_pretrained( - text_encoder_name_or_path) + self.text_encoder = CLIPTextModel.from_pretrained(text_encoder_name_or_path) else: - self.text_encoder = CLIPTextModel( - **read_json(model_args.text_encoder_config_file)) + self.text_encoder = CLIPTextModel(**read_json(model_args.text_encoder_config_file)) self.init_text_encoder_weights() if not model_args.is_text_encoder_trainable: freeze_params(self.text_encoder.parameters()) @@ -148,14 +153,14 @@ def __init__(self, model_args): # init unet unet_name_or_path = ( model_args.unet_name_or_path - if model_args.pretrained_model_name_or_path is None else - os.path.join(model_args.pretrained_model_name_or_path, "unet")) + if model_args.pretrained_model_name_or_path is None + else os.path.join(model_args.pretrained_model_name_or_path, "unet") + ) self.unet_is_pretrained = model_args.pretrained_model_name_or_path is not None if self.unet_is_pretrained: self.unet = LVDMUNet3DModel.from_pretrained(unet_name_or_path) else: - self.unet = 
LVDMUNet3DModel( - **read_json(model_args.unet_config_file)) + self.unet = LVDMUNet3DModel(**read_json(model_args.unet_config_file)) self.init_unet_weights() # init train scheduler @@ -163,7 +168,8 @@ def __init__(self, model_args): beta_start=model_args.scheduler_beta_start, beta_end=model_args.scheduler_beta_end, beta_schedule="scaled_linear", - num_train_timesteps=model_args.scheduler_num_train_timesteps, ) + num_train_timesteps=model_args.scheduler_num_train_timesteps, + ) # init eval scheduler self.eval_scheduler = DDIMScheduler( @@ -173,23 +179,23 @@ def __init__(self, model_args): num_train_timesteps=model_args.scheduler_num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) - self.eval_scheduler.set_timesteps( - model_args.eval_scheduler_num_inference_steps) + set_alpha_to_one=False, + ) + self.eval_scheduler.set_timesteps(model_args.eval_scheduler_num_inference_steps) # set training parameters self.use_ema = model_args.use_ema if self.use_ema: self.model_ema = LitEma(self.unet) - if (model_args.enable_xformers_memory_efficient_attention and - is_ppxformers_available()): + if model_args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: self.unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) self.scale_factor = model_args.scale_factor self.shift_factor = model_args.shift_factor self.loss_type = model_args.loss_type @@ -198,24 +204,19 @@ def __init__(self, model_args): self.use_preconfig_latents = False if model_args.latents_path: self.use_preconfig_latents = True - self.register_buffer("preconfig_latents", - paddle.load(model_args.latents_path)) + self.register_buffer("preconfig_latents", paddle.load(model_args.latents_path)) - self.if_numpy_genarator_random_alignment = ( - model_args.if_numpy_genarator_random_alignment) + self.if_numpy_genarator_random_alignment = model_args.if_numpy_genarator_random_alignment if self.if_numpy_genarator_random_alignment: - self.generator = np.random.RandomState( - model_args.numpy_genarator_random_seed) + self.generator = np.random.RandomState(model_args.numpy_genarator_random_seed) self.set_seed_for_alignment = model_args.set_seed_for_alignment def init_text_encoder_weights(self): if not self.text_encoder_is_pretrained: reset_initialized_parameter(self.text_encoder) - normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, - 0.02) - normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, - 0.02) + normal_(self.text_encoder.embeddings.word_embeddings.weight, 0, 0.02) + normal_(self.text_encoder.embeddings.position_embeddings.weight, 0, 0.02) def init_unet_weights(self): if not self.unet_is_pretrained: @@ -256,9 +257,7 @@ def get_first_stage_encoding(self, encoder_posterior, noise=None): elif isinstance(encoder_posterior, paddle.Tensor): z = encoder_posterior else: - raise NotImplementedError( - f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented" - ) + raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented") z = self.scale_factor * (z + self.shift_factor) return z @@ -291,12 +290,7 @@ def decode(self, z, **kwargs): return results @paddle.no_grad() - def overlapped_decode(self, - z, - max_z_t=None, - overlap_t=2, - predict_cids=False, - force_not_quantize=False): + def 
overlapped_decode(self, z, max_z_t=None, overlap_t=2, predict_cids=False, force_not_quantize=False): if max_z_t is None: max_z_t = z.shape[2] assert max_z_t > overlap_t @@ -315,69 +309,56 @@ def overlapped_decode(self, reses = [] for i, z_ in enumerate(zs): if i == 0: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, :max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, : max_x_t - drop_r_x, :, :] elif i == len(zs) - 1: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, drop_l_x:, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, drop_l_x:, :, :] else: - res = self.decode(z_, predict_cids, force_not_quantize).cpu( - )[:, :, drop_l_x:max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[ + :, :, drop_l_x : max_x_t - drop_r_x, :, : + ] reses.append(res) results = paddle.concat(x=reses, axis=2) return results @paddle.no_grad() - def decode_first_stage_2DAE_video(self, - z, - decode_bs=16, - return_cpu=True, - **kwargs): + def decode_first_stage_2DAE_video(self, z, decode_bs=16, return_cpu=True, **kwargs): b, _, t, _, _ = z.shape z = rearrange(z, "b c t h w -> (b t) c h w") if decode_bs is None: results = self.decode(z, **kwargs) else: - z = paddle.split( - x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) if return_cpu: - results = paddle.concat( - x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) + results = paddle.concat(x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) else: - results = paddle.concat( - x=[self.decode(z_, **kwargs) for z_ in z], axis=0) - results = rearrange( - results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() + results = paddle.concat(x=[self.decode(z_, **kwargs) for z_ in z], axis=0) + results = rearrange(results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() return results @paddle.no_grad() def decode_latents( - self, - z, - decode_bs=16, - return_cpu=True, - bs=None, - decode_single_video_allframes=False, - max_z_t=None, - overlapped_length=0, - **kwargs, ): + self, + z, + decode_bs=16, + return_cpu=True, + bs=None, + decode_single_video_allframes=False, + max_z_t=None, + overlapped_length=0, + **kwargs, + ): b, _, t, _, _ = z.shape if self.encoder_type == "2d" and z.dim() == 5: - return self.decode_first_stage_2DAE_video( - z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) + return self.decode_first_stage_2DAE_video(z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) if decode_single_video_allframes: z = paddle.split(x=z, num_or_sections=z.shape[0] // 1, axis=0) cat_dim = 0 elif max_z_t is not None: if self.encoder_type == "3d": - z = paddle.split( - x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) + z = paddle.split(x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) cat_dim = 2 if self.encoder_type == "2d": - z = paddle.split( - x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) cat_dim = 0 # elif self.split_clips and self.downfactor_t is not None or self.clip_length is not None and self.downfactor_t is not None and z.shape[ # 2 @@ -410,8 +391,7 @@ def get_loss(self, pred, target, mean=True, mask=None): if mean: loss = paddle.nn.functional.mse_loss(target, pred) else: - loss = paddle.nn.functional.mse_loss( - target, pred, reduction="none") + loss = paddle.nn.functional.mse_loss(target, pred, reduction="none") else: raise 
NotImplementedError("unknown loss type '{loss_type}'") if mask is not None: @@ -438,18 +418,18 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): self.generator.randint( 0, self.noise_scheduler.num_train_timesteps, - size=(latents.shape[0], ), ), - dtype="int64", ) - noise = paddle.to_tensor( - self.generator.randn(*latents.shape), dtype="float32") + size=(latents.shape[0],), + ), + dtype="int64", + ) + noise = paddle.to_tensor(self.generator.randn(*latents.shape), dtype="float32") else: timesteps = paddle.randint( - 0, self.noise_scheduler.num_train_timesteps, - (latents.shape[0], )).astype("int64") + 0, self.noise_scheduler.num_train_timesteps, (latents.shape[0],) + ).astype("int64") noise = paddle.randn_like(latents) - noisy_latents = self.noise_scheduler.add_noise(latents, noise, - timesteps) + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) encoder_hidden_states = None if self.task_type == "text2video": encoder_hidden_states = self.text_encoder(input_ids)[0] @@ -458,7 +438,8 @@ def forward(self, input_ids=None, pixel_values=None, **kwargs): noise_pred = self.unet( noisy_latents, timesteps, - context=encoder_hidden_states, ).sample + context=encoder_hidden_states, + ).sample loss = self.get_loss(noise_pred, noise, mean=True) return loss @@ -485,20 +466,19 @@ def log_reconstruct_frames(self, pixel_values=None, **kwargs): @paddle.no_grad() def log_text2video_sample_frames( - self, - input_ids=None, - height=256, - width=256, - eta=1.0, - guidance_scale=9, - num_frames=16, - **kwargs, ): + self, + input_ids=None, + height=256, + width=256, + eta=1.0, + guidance_scale=9, + num_frames=16, + **kwargs, + ): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 2 video if input_ids.shape[0] > 2: input_ids = input_ids[:2] @@ -512,10 +492,10 @@ def log_text2video_sample_frames( padding="max_length", truncation=True, max_length=max_length, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings], axis=0) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings], axis=0) if self.use_preconfig_latents: latents = self.preconfig_latents else: @@ -528,36 +508,32 @@ def log_text2video_sample_frames( ] latents = paddle.randn(shape) - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for t in self.eval_scheduler.timesteps: # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, - context=text_embeddings, ).sample + context=text_embeddings, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample sampled_videos = self.decode_latents(latents) @@ -574,19 +550,11 @@ def log_text2video_sample_frames( return videos_frames @paddle.no_grad() - def log_short_sample_frames(self, - height=256, - width=256, - eta=0.0, - guidance_scale=9, - num_frames=16, - **kwargs): + def log_short_sample_frames(self, height=256, width=256, eta=0.0, guidance_scale=9, num_frames=16, **kwargs): self.eval() with self.ema_scope(): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # only log 2 video batch_size = 2 @@ -602,8 +570,7 @@ def log_short_sample_frames(self, ] latents = paddle.randn(shape) - accepts_eta = "eta" in set( - inspect.signature(self.eval_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.eval_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta @@ -613,17 +580,16 @@ def log_short_sample_frames(self, latent_model_input = latents # ddim donot use this - latent_model_input = self.eval_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.eval_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, - t, ).sample + t, + ).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.eval_scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.eval_scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample sampled_videos = self.decode_latents(latents) @@ -643,7 +609,6 @@ def set_recompute(self, value=False): def fn(layer): if hasattr(layer, "gradient_checkpointing"): layer.gradient_checkpointing = value - print("Set", layer.__class__, "recompute", - layer.gradient_checkpointing) + print("Set", layer.__class__, "recompute", layer.gradient_checkpointing) self.unet.apply(fn) diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py index 90d32ee1eda0b..9fa09eb560f4c 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/lvdm_trainer.py @@ -19,8 +19,11 @@ import paddle.amp.auto_cast as autocast from paddle.io import DataLoader from paddlenlp.trainer import Trainer -from paddlenlp.trainer.integrations import (INTEGRATION_TO_CALLBACK, - VisualDLCallback, rewrite_logs) +from paddlenlp.trainer.integrations import ( + INTEGRATION_TO_CALLBACK, + VisualDLCallback, + rewrite_logs, +) from paddlenlp.trainer.utils.helper import nested_detach from paddlenlp.utils.log import logger @@ -39,19 +42,17 @@ def autocast_smart_context_manager(self, args): "c_softmax_with_cross_entropy", ], level=args.fp16_opt_level, - dtype=amp_dtype, ) + dtype=amp_dtype, + ) else: - ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() return ctx_manager def on_step_end(self, args, state, control, model=None, **kwargs): if hasattr(model, "on_train_batch_end"): model.on_train_batch_end() - if (args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if args.image_logging_steps > 0 and state.global_step % args.image_logging_steps == 0: control.should_log = True def on_log(self, args, state, control, logs=None, **kwargs): @@ -62,27 +63,30 @@ def on_log(self, args, state, control, logs=None, **kwargs): inputs = kwargs.get("inputs", None) model = kwargs.get("model", None) image_logs = {} - if (inputs is not None and model is not None and - args.image_logging_steps > 0 and - state.global_step % args.image_logging_steps == 0): + if ( + inputs is not None + and model is not None + and args.image_logging_steps > 0 + and state.global_step % args.image_logging_steps == 0 + ): with self.autocast_smart_context_manager(args): - 
image_logs["reconstruction"] = model.log_reconstruct_frames( - pixel_values=inputs["pixel_values"]) + image_logs["reconstruction"] = model.log_reconstruct_frames(pixel_values=inputs["pixel_values"]) if model.task_type == "text2video": - image_logs[ - "ddim-samples"] = model.log_text2video_sample_frames( - input_ids=inputs["input_ids"], - height=256, - width=256, - eta=1.0, - guidance_scale=9, - num_frames=16, ) + image_logs["ddim-samples"] = model.log_text2video_sample_frames( + input_ids=inputs["input_ids"], + height=256, + width=256, + eta=1.0, + guidance_scale=9, + num_frames=16, + ) elif model.task_type == "short": image_logs["ddim-samples"] = model.log_short_sample_frames( height=256, width=256, eta=1.0, - num_frames=16, ) + num_frames=16, + ) if self.vdl_writer is None: self._init_summary_writer(args) @@ -97,11 +101,11 @@ def on_log(self, args, state, control, logs=None, **kwargs): "Trainer is attempting to log a value of " f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' "This invocation of VisualDL's writer.add_scalar() " - "is incorrect so we dropped this attribute.") + "is incorrect so we dropped this attribute." + ) # log images for k, v in image_logs.items(): - self.vdl_writer.add_image( - k, v, state.global_step, dataformats="NHWC") + self.vdl_writer.add_image(k, v, state.global_step, dataformats="NHWC") self.vdl_writer.flush() @@ -117,43 +121,41 @@ def compute_loss(self, model, inputs, return_outputs=False): def get_train_dataloader(self): if self.train_dataset is None: raise ValueError("Trainer: training requires a train_dataset.") - if isinstance(self.train_dataset, VideoFrameDataset) or isinstance( - self.train_dataset, WebVidDataset): + if isinstance(self.train_dataset, VideoFrameDataset) or isinstance(self.train_dataset, WebVidDataset): return DataLoader( self.train_dataset, batch_size=self.args.train_batch_size, num_workers=self.args.dataloader_num_workers, shuffle=True, worker_init_fn=None, - collate_fn=None, ) + collate_fn=None, + ) else: return super().get_train_dataloader() def prediction_step( - self, - model, - inputs, - prediction_loss_only, - ignore_keys, ): + self, + model, + inputs, + prediction_loss_only, + ignore_keys, + ): if self.args.pipeline_parallel_degree > 1: # hack for pipeline mode inputs = self._prepare_inputs(inputs) - return self.prediction_pipeline_step( - model, inputs, prediction_loss_only, ignore_keys) + return self.prediction_pipeline_step(model, inputs, prediction_loss_only, ignore_keys) has_labels = all(inputs.get(k) is not None for k in self.label_names) inputs = self._prepare_inputs(inputs) if ignore_keys is None: if hasattr(self.model, "config"): - ignore_keys = getattr(self.model.config, - "keys_to_ignore_at_inference", []) + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) else: ignore_keys = [] # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. 
if has_labels: - labels = nested_detach( - tuple(inputs.get(name) for name in self.label_names)) + labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) if len(labels) == 1: labels = labels[0] else: diff --git a/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py b/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py index 345c3311c88cd..b6636d5924fec 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py +++ b/ppdiffusers/examples/text_to_video_lvdm/lvdm/webvid_dataset.py @@ -38,16 +38,17 @@ class WebVidDataset(paddle.io.Dataset): """ def __init__( - self, - data_root, - resolution, - video_length, - subset_split, - frame_stride, - spatial_transform="", - load_method="decord", - annotation_path=None, - tokenizer=None, ): + self, + data_root, + resolution, + video_length, + subset_split, + frame_stride, + spatial_transform="", + load_method="decord", + annotation_path=None, + tokenizer=None, + ): self.annotation_path = annotation_path self.data_root = data_root self.resolution = resolution @@ -57,9 +58,7 @@ def __init__( self.spatial_transform = spatial_transform self.load_method = load_method assert self.load_method in ["decord", "readvideo", "videoclips"] - assert self.subset_split in [ - "train", "test", "all", "results_10M_train" - ] + assert self.subset_split in ["train", "test", "all", "results_10M_train"] self.exts = ["avi", "mp4", "webm"] if isinstance(self.resolution, int): self.resolution = [self.resolution, self.resolution] @@ -67,22 +66,23 @@ def __init__( self.max_resolution = max(self.resolution) if self.spatial_transform == "center_crop_resize": print("Spatial transform: center crop and then resize") - self.video_transform = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - CenterCropVideo(resolution), - ]) - self.video_transform_step1 = paddle.vision.transforms.Compose([ - paddle.vision.transforms.Resize(resolution), - ]) - self.video_transform_step2 = paddle.vision.transforms.Compose( - [CenterCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + CenterCropVideo(resolution), + ] + ) + self.video_transform_step1 = paddle.vision.transforms.Compose( + [ + paddle.vision.transforms.Resize(resolution), + ] + ) + self.video_transform_step2 = paddle.vision.transforms.Compose([CenterCropVideo(resolution)]) elif self.spatial_transform == "resize": print("Spatial transform: resize with no crop") - self.video_transform = paddle.vision.transforms.Resize( - (resolution, resolution)) + self.video_transform = paddle.vision.transforms.Resize((resolution, resolution)) elif self.spatial_transform == "random_crop": - self.video_transform = paddle.vision.transforms.Compose( - [RandomCropVideo(resolution)]) + self.video_transform = paddle.vision.transforms.Compose([RandomCropVideo(resolution)]) elif self.spatial_transform == "": self.video_transform = None else: @@ -96,7 +96,8 @@ def __init__( truncation=True, max_length=tokenizer.model_max_length, return_tensors="pd", - return_overflowing_tokens=False, ).input_ids[0] + return_overflowing_tokens=False, + ).input_ids[0] else: self.text_processing = None @@ -111,12 +112,9 @@ def _make_dataset(self): self.annotations = fp.read().splitlines() else: self.annotations = sum( - [ - glob.glob( - os.path.join(data_folder, "**", f"*.{ext}"), - recursive=True) for ext in self.exts - ], - [], ) + [glob.glob(os.path.join(data_folder, "**", f"*.{ext}"), recursive=True) for ext in 
self.exts], + [], + ) print(f"Number of videos = {len(self.annotations)}") def get_annotation(self, index): @@ -140,7 +138,8 @@ def get_data_decord(self, index): video_path, ctx=cpu(0), width=self.max_resolution, - height=self.max_resolution, ) + height=self.max_resolution, + ) if len(video_reader) < self.video_length: index += 1 continue @@ -155,23 +154,20 @@ def get_data_decord(self, index): rand_idx = random.randint(0, len(all_frames) - self.video_length) frame_indices = list(range(rand_idx, rand_idx + self.video_length)) frames = video_reader.get_batch(frame_indices) - assert (frames.shape[0] == self.video_length - ), f"{len(frames)}, self.video_length={self.video_length}" - frames = (paddle.to_tensor(data=frames.asnumpy()) - .astype(dtype="float32").transpose(perm=[0, 3, 1, 2])) + assert frames.shape[0] == self.video_length, f"{len(frames)}, self.video_length={self.video_length}" + frames = paddle.to_tensor(data=frames.asnumpy()).astype(dtype="float32").transpose(perm=[0, 3, 1, 2]) if self.video_transform is not None: if self.spatial_transform == "center_crop_resize": temp_frames = rearrange(frames, "c t h w -> (c t) h w") temp_frames = self.video_transform_step1(temp_frames) - frames = rearrange( - temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) + frames = rearrange(temp_frames, "(c t) h w -> c t h w", c=frames.shape[0]) frames = self.video_transform_step2(frames) else: frames = self.video_transform(frames) frames = frames.transpose(perm=[1, 0, 2, 3]).astype(dtype="float32") - assert (frames.shape[2] == self.resolution[0] and - frames.shape[3] == self.resolution[1] - ), f"frames={frames.shape}, self.resolution={self.resolution}" + assert ( + frames.shape[2] == self.resolution[0] and frames.shape[3] == self.resolution[1] + ), f"frames={frames.shape}, self.resolution={self.resolution}" frames = (frames / 255 - 0.5) * 2 data = {"video": frames, "caption": caption} @@ -181,7 +177,9 @@ def get_data_decord(self, index): "input_ids": self.text_processing(data["caption"]), } else: - tensor_out = {"pixel_values": data["video"], } + tensor_out = { + "pixel_values": data["video"], + } return tensor_out def get_data_readvideo(self, index): @@ -215,9 +213,9 @@ def main(): subset_split=subset_split, frame_stride=frame_stride, spatial_transform=spatial_transform, - annotation_path=annotation_path, ) - dataloader = paddle.io.data.DataLoader( - dataset, batch_size=2, shuffle=False, num_workers=0) + annotation_path=annotation_path, + ) + dataloader = paddle.io.data.DataLoader(dataset, batch_size=2, shuffle=False, num_workers=0) starttime = time.time() for id, data in enumerate(dataloader): endtime = time.time() @@ -227,7 +225,8 @@ def main(): endtime - starttime, " shape:", data["video"].shape, - data["caption"], ) + data["caption"], + ) starttime = endtime return diff --git a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py index f0ef60f1d4cfd..33a27a91410e8 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py +++ b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_short.py @@ -17,8 +17,7 @@ from ppdiffusers import LVDMUncondPipeline # 加载模型和scheduler -pipe = LVDMUncondPipeline.from_pretrained( - "westfish/lvdm_short_sky_epoch2239_step150079") +pipe = LVDMUncondPipeline.from_pretrained("westfish/lvdm_short_sky_epoch2239_step150079") # 执行pipeline进行推理 seed = 1000 @@ -32,4 +31,5 @@ save_dir=".", save_name="ddim_lvdm_short_sky_epoch2239_step150079", 
scale_factor=0.33422927, - shift_factor=1.4606637, ) + shift_factor=1.4606637, +) diff --git a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py index 520ee5339fbde..bbd9587186d87 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/scripts/lvdm_sample_text2video.py @@ -17,8 +17,7 @@ from ppdiffusers import LVDMTextToVideoPipeline # 加载模型和scheduler -pipe = LVDMTextToVideoPipeline.from_pretrained( - "westfish/lvdm_text2video_orig_webvid_2m") +pipe = LVDMTextToVideoPipeline.from_pretrained("westfish/lvdm_text2video_orig_webvid_2m") # 执行pipeline进行推理 seed = 2013 @@ -36,4 +35,5 @@ save_name="ddim_lvdm_text_to_video_ucf", encoder_type="2d", scale_factor=0.18215, - shift_factor=0, ) + shift_factor=0, +) diff --git a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py index d562f6ff8b359..2db650c780345 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_short_ckpt_to_ppdiffusers.py @@ -27,13 +27,19 @@ raise ImportError( "OmegaConf is required to convert the SD checkpoints. Please install it with `pip install OmegaConf`." ) -from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, LVDMAutoencoderKL, - LVDMUncondPipeline, LVDMUNet3DModel, PNDMScheduler) + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + LVDMAutoencoderKL, + LVDMUncondPipeline, + LVDMUNet3DModel, + PNDMScheduler, +) paddle.set_device("cpu") MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 @@ -116,8 +122,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): FILENAME = f"archive/{file_name}".encode("latin") padding_size_plus_fbxx = 4 + 14 data_iostream = [] - offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len( - FILENAME) + padding_size_plus_fbxx + offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx with open(file, "rb") as r: r.seek(offset) for bytes_data in io.BytesIO(r.read()): @@ -130,8 +135,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): return out, offset + len(out) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): if isinstance(storage, TensorMeta): storage.size = size return storage @@ -162,7 +166,8 @@ def create_unet_diffusers_config(original_config): padding_t=unet_params.padding_t, temporal_length=unet_params.temporal_length, use_relative_position=unet_params.use_relative_position, - use_scale_shift_norm=unet_params.use_scale_shift_norm, ) + use_scale_shift_norm=unet_params.use_scale_shift_norm, + ) return config @@ -181,7 +186,8 @@ def create_lvdm_vae_diffusers_config(original_config): padding_type=vae_params.encoder.params.padding_type, double_z=vae_params.encoder.params.double_z, z_channels=vae_params.encoder.params.z_channels, - upsample=vae_params.decoder.params.upsample, ) + 
upsample=vae_params.decoder.params.upsample, + ) return config @@ -190,14 +196,12 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular -def convert_lvdm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_lvdm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -218,8 +222,7 @@ def convert_lvdm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -251,9 +254,7 @@ def convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, config): return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -275,8 +276,7 @@ def check_keys(model, state_dict): if k not in state_dict.keys(): missing_keys.append(k) if list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append( - str((k, list(v.shape), list(state_dict[k].shape)))) + mismatched_keys.append(str((k, list(v.shape), list(state_dict[k].shape)))) if len(missing_keys): missing_keys_str = ", ".join(missing_keys) print(f"{cls_name} Found missing_keys {missing_keys_str}!") @@ -293,13 +293,15 @@ def check_keys(model, state_dict): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--vae_checkpoint_path", default=None, type=str, required=False, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default=None, @@ -325,13 +327,15 @@ def check_keys(model, state_dict): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." 
- ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() # image_size = 512 @@ -340,15 +344,13 @@ def check_keys(model, state_dict): vae_checkpoint = None if args.vae_checkpoint_path: - vae_checkpoint = torch.load( - args.vae_checkpoint_path, map_location="cpu") + vae_checkpoint = torch.load(args.vae_checkpoint_path, map_location="cpu") vae_checkpoint = vae_checkpoint.get("state_dict", vae_checkpoint) original_config = OmegaConf.load(args.original_config_file) if args.num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = args.num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -361,7 +363,8 @@ def check_keys(model, state_dict): num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -377,15 +380,13 @@ def check_keys(model, state_dict): elif args.scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif args.scheduler_type == "ddim": scheduler = scheduler else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") # 1. Convert the LVDMUNet3DModel model. diffusers_unet_config = create_unet_diffusers_config(original_config) @@ -393,26 +394,25 @@ def check_keys(model, state_dict): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = LVDMUNet3DModel.from_config(diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the LVDMAutoencoderKL model. 
vae_config = create_lvdm_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint( - checkpoint, vae_checkpoint, vae_config) + diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config) vae = LVDMAutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) pipe = LVDMUncondPipeline( vae=vae, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py index 0b09aa164dfe5..0662e05b5bcaa 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/examples/text_to_video_lvdm/tools/convert_orig_lvdm_text2video_ckpt_to_ppdiffusers.py @@ -27,13 +27,20 @@ "OmegaConf is required to convert the SD checkpoints. Please install it with `pip install OmegaConf`." ) from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from transformers import CLIPTextModel as HFCLIPTextModel from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, LVDMAutoencoderKL, - LVDMTextToVideoPipeline, LVDMUNet3DModel, PNDMScheduler) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + LVDMAutoencoderKL, + LVDMTextToVideoPipeline, + LVDMUNet3DModel, + PNDMScheduler, +) paddle.set_device("cpu") MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 @@ -116,8 +123,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): FILENAME = f"archive/{file_name}".encode("latin") padding_size_plus_fbxx = 4 + 14 data_iostream = [] - offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len( - FILENAME) + padding_size_plus_fbxx + offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx with open(file, "rb") as r: r.seek(offset) for bytes_data in io.BytesIO(r.read()): @@ -130,8 +136,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): return out, offset + len(out) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): if isinstance(storage, TensorMeta): storage.size = size return storage @@ -160,8 +165,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -191,8 +195,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, 
n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -200,12 +203,13 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements @@ -213,9 +217,7 @@ def assign_to_checkpoint( Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -223,13 +225,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) @@ -241,8 +241,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -252,8 +251,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -297,7 +295,8 @@ def create_unet_diffusers_config(original_config): kernel_size_t=unet_params.kernel_size_t, padding_t=unet_params.padding_t, temporal_length=unet_params.temporal_length, - use_relative_position=unet_params.use_relative_position, ) + use_relative_position=unet_params.use_relative_position, + ) return config @@ -321,7 +320,8 @@ def create_vae_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config @@ -339,7 +339,8 @@ def create_lvdm_vae_diffusers_config(original_config): padding_type=vae_params.encoder.params.padding_type, double_z=vae_params.encoder.params.double_z, z_channels=vae_params.encoder.params.z_channels, - upsample=vae_params.decoder.params.upsample, ) + upsample=vae_params.decoder.params.upsample, + ) return config @@ -348,14 +349,12 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, 
beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular -def convert_lvdm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_lvdm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -376,8 +375,7 @@ def convert_lvdm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -407,107 +405,74 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config): # new_checkpoint = vae_state_dict new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = 
vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -515,58 +480,50 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) 
conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -574,7 +531,8 @@ def convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint @@ -595,9 +553,7 @@ def convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, config): return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -619,8 +575,7 @@ def check_keys(model, state_dict): if k not in state_dict.keys(): missing_keys.append(k) elif list(v.shape) != list(state_dict[k].shape): - mismatched_keys.append( - str((k, list(v.shape), list(state_dict[k].shape)))) + mismatched_keys.append(str((k, list(v.shape), list(state_dict[k].shape)))) if len(missing_keys): missing_keys_str = ", ".join(missing_keys) print(f"{cls_name} Found missing_keys {missing_keys_str}!") @@ -633,7 +588,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): clip = {} for key in checkpoint.keys(): if key.startswith("cond_stage_model.transformer"): - 
clip[key[len("cond_stage_model.transformer."):]] = checkpoint[key] + clip[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] new_model_state = {} transformers2ppnlp = { @@ -653,9 +608,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.items(): # step1: ignore position_ids if any(i in name for i in ignore_value): @@ -668,16 +621,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.cpu().numpy().astype(dtype) new_config = { - "max_text_length": - new_model_state["text_model.positional_embedding.weight"].shape[0], - "vocab_size": - new_model_state["text_model.token_embedding.weight"].shape[0], - "text_embed_dim": - new_model_state["text_model.token_embedding.weight"].shape[1], + "max_text_length": new_model_state["text_model.positional_embedding.weight"].shape[0], + "vocab_size": new_model_state["text_model.token_embedding.weight"].shape[0], + "text_embed_dim": new_model_state["text_model.token_embedding.weight"].shape[1], "text_heads": 12, "text_layers": 12, "text_hidden_act": "quick_gelu", @@ -696,19 +646,22 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--vae_checkpoint_path", default=None, type=str, required=False, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--vae_type", default="2d", type=str, required=False, - help="The type of vae, chosen from [`2d `, `3d`].", ) + help="The type of vae, chosen from [`2d `, `3d`].", + ) parser.add_argument( "--original_config_file", default=None, @@ -734,13 +687,15 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." 
- ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() image_size = 512 @@ -750,15 +705,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): vae_checkpoint = None if args.vae_checkpoint_path: - vae_checkpoint = torch.load( - args.vae_checkpoint_path, map_location="cpu") + vae_checkpoint = torch.load(args.vae_checkpoint_path, map_location="cpu") vae_checkpoint = vae_checkpoint.get("state_dict", vae_checkpoint) original_config = OmegaConf.load(args.original_config_file) if args.num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = args.num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -771,7 +724,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -786,15 +740,13 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): elif args.scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif args.scheduler_type == "ddim": scheduler = scheduler else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") # 1. Convert the LVDMUNet3DModel model. diffusers_unet_config = create_unet_diffusers_config(original_config) @@ -802,46 +754,41 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = LVDMUNet3DModel.from_config(diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the AutoencoderKL model. 
if args.vae_type == "2d": - vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint( - checkpoint, vae_checkpoint, vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config) vae = AutoencoderKL.from_config(vae_config) else: vae_config = create_lvdm_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint( - checkpoint, vae_checkpoint, vae_config) + diffusers_vae_checkpoint = convert_lvdm_vae_checkpoint(checkpoint, vae_checkpoint, vae_config) vae = LVDMAutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text model. - text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - checkpoint, dtype="float32") + text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32") text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_config)) text_encoder.eval() check_keys(text_encoder, text_model_state_dict) text_encoder.load_dict(text_model_state_dict) # 4. load tokenizer. - pp_tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") + pp_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") pipe = LVDMTextToVideoPipeline( vae=vae, text_encoder=text_encoder, tokenizer=pp_tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py index 967fa9cd80f36..2eba6ece4b713 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py +++ b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_short.py @@ -16,34 +16,40 @@ import os import paddle -from lvdm import (LatentVideoDiffusion, LatentVideoDiffusionTrainer, - VideoFrameDataset) -from lvdm.lvdm_args_short import (ModelArguments, TrainerArguments, - VideoFrameDatasetArguments) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from lvdm import LatentVideoDiffusion, LatentVideoDiffusionTrainer, VideoFrameDataset +from lvdm.lvdm_args_short import ( + ModelArguments, + TrainerArguments, + VideoFrameDatasetArguments, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger def main(): - parser = PdArgumentParser(( - ModelArguments, - VideoFrameDatasetArguments, - TrainerArguments, - TrainingArguments, )) + parser = PdArgumentParser( + ( + ModelArguments, + VideoFrameDatasetArguments, + TrainerArguments, + TrainingArguments, + ) + ) ( model_args, data_args, trainer_args, - training_args, ) = parser.parse_args_into_dataclasses() + training_args, + ) = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = trainer_args.image_logging_steps = ( - (math.ceil(trainer_args.image_logging_steps / - training_args.logging_steps) * training_args.logging_steps) - if trainer_args.image_logging_steps > 0 
else -1) + (math.ceil(trainer_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps) + if trainer_args.image_logging_steps > 0 + else -1 + ) training_args.print_config(model_args, "Model") training_args.print_config(trainer_args, "Trainer") @@ -53,16 +59,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -78,7 +82,8 @@ def main(): subset_split=data_args.train_subset_split, spatial_transform=data_args.spatial_transform, clip_step=data_args.clip_step, - temporal_transform=data_args.temporal_transform, ) + temporal_transform=data_args.temporal_transform, + ) eval_dataset = VideoFrameDataset( data_root=data_args.eval_data_root, resolution=data_args.resolution, @@ -87,13 +92,15 @@ def main(): subset_split=data_args.eval_subset_split, spatial_transform=data_args.spatial_transform, clip_step=data_args.clip_step, - temporal_transform=data_args.temporal_transform, ) + temporal_transform=data_args.temporal_transform, + ) trainer = LatentVideoDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, - eval_dataset=eval_dataset, ) + eval_dataset=eval_dataset, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) diff --git a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py index 4959f59c1b1a6..f7a04f62abb77 100644 --- a/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py +++ b/ppdiffusers/examples/text_to_video_lvdm/train_lvdm_text2video.py @@ -16,31 +16,33 @@ import os import paddle -from lvdm import (LatentVideoDiffusion, LatentVideoDiffusionTrainer, - WebVidDataset) -from lvdm.lvdm_args_text2video import (ModelArguments, TrainerArguments, - WebVidDatasetArguments) -from paddlenlp.trainer import (PdArgumentParser, TrainingArguments, - get_last_checkpoint) +from lvdm import LatentVideoDiffusion, LatentVideoDiffusionTrainer, WebVidDataset +from lvdm.lvdm_args_text2video import ( + ModelArguments, + TrainerArguments, + WebVidDatasetArguments, +) +from paddlenlp.trainer import PdArgumentParser, TrainingArguments, get_last_checkpoint from paddlenlp.utils.log import logger def main(): - parser = PdArgumentParser((ModelArguments, WebVidDatasetArguments, - TrainerArguments, TrainingArguments)) + parser = PdArgumentParser((ModelArguments, WebVidDatasetArguments, TrainerArguments, TrainingArguments)) ( model_args, data_args, trainer_args, - training_args, ) = parser.parse_args_into_dataclasses() + 
training_args, + ) = parser.parse_args_into_dataclasses() # report to custom_visualdl training_args.report_to = ["custom_visualdl"] training_args.resolution = data_args.resolution training_args.image_logging_steps = trainer_args.image_logging_steps = ( - (math.ceil(trainer_args.image_logging_steps / - training_args.logging_steps) * training_args.logging_steps) - if trainer_args.image_logging_steps > 0 else -1) + (math.ceil(trainer_args.image_logging_steps / training_args.logging_steps) * training_args.logging_steps) + if trainer_args.image_logging_steps > 0 + else -1 + ) training_args.print_config(model_args, "Model") training_args.print_config(trainer_args, "Trainer") @@ -50,16 +52,14 @@ def main(): # Detecting last checkpoint. last_checkpoint = None - if (os.path.isdir(training_args.output_dir) and training_args.do_train and - not training_args.overwrite_output_dir): + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len( - os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome.") - elif (last_checkpoint is not None and - training_args.resume_from_checkpoint is None): + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -74,7 +74,8 @@ def main(): video_length=data_args.video_length, frame_stride=data_args.frame_stride, spatial_transform=data_args.spatial_transform, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) eval_dataset = WebVidDataset( data_root=data_args.eval_data_root, annotation_path=data_args.eval_annotation_path, @@ -83,14 +84,16 @@ def main(): video_length=data_args.video_length, frame_stride=data_args.frame_stride, spatial_transform=data_args.spatial_transform, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) trainer = LatentVideoDiffusionTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=model.tokenizer, ) + tokenizer=model.tokenizer, + ) # must set recompute after trainer init trainer.model.set_recompute(training_args.recompute) diff --git a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py b/ppdiffusers/examples/textual_inversion/train_textual_inversion.py index 32134c2808903..26c629dff52ae 100644 --- a/ppdiffusers/examples/textual_inversion/train_textual_inversion.py +++ b/ppdiffusers/examples/textual_inversion/train_textual_inversion.py @@ -29,10 +29,10 @@ import paddle.nn as nn import paddle.nn.functional as F from huggingface_hub import HfFolder, Repository, create_repo, whoami -from paddle.distributed.fleet.utils.hybrid_parallel_util import \ - fused_allreduce_gradients -from paddle.io import (BatchSampler, DataLoader, Dataset, - DistributedBatchSampler) +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler from paddle.optimizer import AdamW from paddle.vision.transforms 
import RandomHorizontalFlip from paddlenlp.trainer import set_seed @@ -41,27 +41,30 @@ from PIL import Image from tqdm.auto import tqdm -from ppdiffusers import (AutoencoderKL, DDPMScheduler, DiffusionPipeline, - DPMSolverMultistepScheduler, UNet2DConditionModel, - is_ppxformers_available) +from ppdiffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, + is_ppxformers_available, +) from ppdiffusers.optimization import get_scheduler -from ppdiffusers.training_utils import (freeze_params, unfreeze_params, - unwrap_model) +from ppdiffusers.training_utils import freeze_params, unfreeze_params, unwrap_model from ppdiffusers.utils import PIL_INTERPOLATION, check_min_version check_min_version("0.16.1") def url_or_path_join(*path_list): - return (os.path.join(*path_list) - if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list)) + return os.path.join(*path_list) if os.path.isdir(os.path.join(*path_list)) else "/".join(path_list) -def import_model_class_from_model_name_or_path( - pretrained_model_name_or_path: str): +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str): try: text_encoder_config = PretrainedConfig.from_pretrained( - url_or_path_join(pretrained_model_name_or_path, "text_encoder")) + url_or_path_join(pretrained_model_name_or_path, "text_encoder") + ) model_class = text_encoder_config.architectures[0] except Exception: model_class = "LDMBertModel" @@ -70,8 +73,9 @@ def import_model_class_from_model_name_or_path( return CLIPTextModel elif model_class == "RobertaSeriesModelWithTransformation": - from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) return RobertaSeriesModelWithTransformation elif model_class == "BertModel": @@ -79,8 +83,9 @@ def import_model_class_from_model_name_or_path( return BertModel elif model_class == "LDMBertModel": - from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( + LDMBertModel, + ) return LDMBertModel else: @@ -118,25 +123,28 @@ def get_report_to(args): def save_progress(text_encoder, placeholder_token_ids, args, save_path): logger.info("Saving embeddings") learned_embeds = ( - unwrap_model(text_encoder).get_input_embeddings() - .weight[min(placeholder_token_ids):max(placeholder_token_ids) + 1]) + unwrap_model(text_encoder) + .get_input_embeddings() + .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] + ) learned_embeds_dict = {args.placeholder_token: learned_embeds.detach()} paddle.save(learned_embeds_dict, save_path) def parse_args(): - parser = argparse.ArgumentParser( - description="Simple example of a training script.") + parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( "--save_steps", type=int, default=500, - help="Save learned_embeds.pdparams every X updates steps.", ) + help="Save learned_embeds.pdparams every X updates steps.", + ) parser.add_argument( "--only_save_embeds", action="store_true", default=True, - help="Save only the embeddings for the new concept.", ) + help="Save only the embeddings for the new concept.", + ) parser.add_argument( "--num_vectors", type=int, @@ -161,70 +169,79 @@ def parse_args(): type=str, default=None, required=True, - help="A folder 
containing the training data.", ) + help="A folder containing the training data.", + ) parser.add_argument( "--placeholder_token", type=str, default=None, required=True, - help="A token to use as a placeholder for the concept.", ) + help="A token to use as a placeholder for the concept.", + ) parser.add_argument( "--initializer_token", type=str, default=None, required=True, - help="A token to use as initializer word.", ) + help="A token to use as initializer word.", + ) parser.add_argument( "--learnable_property", type=str, default="object", - help="Choose between 'object' and 'style'", ) + help="Choose between 'object' and 'style'", + ) parser.add_argument( "--repeats", type=int, default=100, - help="How many times to repeat the training data.", ) + help="How many times to repeat the training data.", + ) parser.add_argument( "--output_dir", type=str, default="text-inversion-model", help="The output directory where the model predictions and checkpoints will be written.", ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--height", type=int, default=None, help=( "The height for input images, all the images in the train/validation dataset will be resized to this" - " height"), ) + " height" + ), + ) parser.add_argument( "--width", type=int, default=None, help=( "The width for input images, all the images in the train/validation dataset will be resized to this" - " width"), ) + " width" + ), + ) parser.add_argument( "--resolution", type=int, default=512, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", action="store_true", - help="Whether to center crop images before resizing to resolution.", ) + help="Whether to center crop images before resizing to resolution.", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument("--num_train_epochs", type=int, default=100) parser.add_argument( "--max_train_steps", @@ -261,19 +278,23 @@ def parse_args(): default="constant", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." 
- ), ) + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--lr_num_cycles", type=int, @@ -284,38 +305,39 @@ def parse_args(): "--lr_power", type=float, default=1.0, - help="Power factor of the polynomial scheduler.", ) + help="Power factor of the polynomial scheduler.", + ) parser.add_argument( "--adam_beta1", type=float, default=0.9, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) - parser.add_argument( - "--adam_weight_decay", - type=float, - default=1e-2, - help="Weight decay to use.") + help="The beta2 parameter for the Adam optimizer.", + ) + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer", ) - parser.add_argument( - "--max_grad_norm", default=-1, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer", + ) + parser.add_argument("--max_grad_norm", default=-1, type=float, help="Max gradient norm.") parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -328,19 +350,24 @@ def parse_args(): default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) or [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl) log directory. Will default to" - "*output_dir/logs"), ) + "*output_dir/logs" + ), + ) parser.add_argument( "--report_to", type=str, default="visualdl", help=( 'The integration to report the results and logs to. Supported platforms are `"visualdl"`' - ' (default), `"tensorboard"`.'), ) + ' (default), `"tensorboard"`.' + ), + ) parser.add_argument( "--language", default="en", choices=["en", "zh", "zh_en"], - help="Model language.", ) + help="Model language.", + ) parser.add_argument( "--validation_prompt", type=str, @@ -360,16 +387,15 @@ def parse_args(): help=( "Run validation every X epochs. Validation consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`" - " and logging the images."), ) + " and logging the images." + ), + ) parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) - parser.add_argument( - "--noise_offset", - type=float, - default=0, - help="The scale of noise offset.") + help="Whether or not to use xformers.", + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") args = parser.parse_args() @@ -379,9 +405,7 @@ def parse_args(): if args.language == "en": if "chinese-en" in args.pretrained_model_name_or_path.lower(): args.language = "zh_en" - logger.info( - "Detect Chinese-English Model, we will set language to 'zh_en'. " - ) + logger.info("Detect Chinese-English Model, we will set language to 'zh_en'. 
") elif "chinese" in args.pretrained_model_name_or_path.lower(): args.language = "zh" logger.info("Detect Chinese Model, we will set language to 'zh'. ") @@ -486,19 +510,20 @@ def parse_args(): class TextualInversionDataset(Dataset): def __init__( - self, - data_root, - tokenizer, - learnable_property="object", # [object, style] - height=512, - width=512, - repeats=100, - interpolation="bicubic", - flip_p=0.5, - set="train", - placeholder_token="*", - center_crop=False, - language="en", ): + self, + data_root, + tokenizer, + learnable_property="object", # [object, style] + height=512, + width=512, + repeats=100, + interpolation="bicubic", + flip_p=0.5, + set="train", + placeholder_token="*", + center_crop=False, + language="en", + ): self.data_root = data_root self.tokenizer = tokenizer self.learnable_property = learnable_property @@ -514,8 +539,7 @@ def __init__( ext = ["png", "jpg", "jpeg", "bmp", "PNG", "JPG", "JPEG", "BMP"] self.image_paths = [] for e in ext: - self.image_paths.extend( - glob.glob(os.path.join(data_root, "*." + e))) + self.image_paths.extend(glob.glob(os.path.join(data_root, "*." + e))) self.num_images = len(self.image_paths) self._length = self.num_images @@ -562,7 +586,8 @@ def __getitem__(self, i): padding="max_length", truncation=True, max_length=self.tokenizer.model_max_length, - return_attention_mask=False, ).input_ids + return_attention_mask=False, + ).input_ids # default to score-sde preprocessing img = np.array(image).astype(np.uint8) @@ -571,13 +596,12 @@ def __getitem__(self, i): crop = min(img.shape[0], img.shape[1]) h, w, = ( img.shape[0], - img.shape[1], ) - img = img[(h - crop) // 2:(h + crop) // 2, (w - crop) // 2:(w + crop - ) // 2] + img.shape[1], + ) + img = img[(h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2] image = Image.fromarray(img) - image = image.resize( - (self.width, self.height), resample=self.interpolation) + image = image.resize((self.width, self.height), resample=self.interpolation) image = self.flip_transform(image) image = np.array(image).astype(np.uint8) @@ -587,9 +611,7 @@ def __getitem__(self, i): return example -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -618,16 +640,13 @@ def main(): os.makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) - repo = Repository( - args.output_dir, clone_from=repo_name, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -638,18 +657,14 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) elif args.pretrained_model_name_or_path: # support windows "\" - tokenizer = AutoTokenizer.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) + tokenizer = 
AutoTokenizer.from_pretrained(url_or_path_join(args.pretrained_model_name_or_path, "tokenizer")) # Load scheduler and models - noise_scheduler = DDPMScheduler.from_pretrained( - args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") # Add the placeholder token in tokenizer placeholder_tokens = [args.placeholder_token] if args.num_vectors < 1: - raise ValueError( - f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}" - ) + raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}") # add dummy tokens for multi-vector additional_tokens = [] @@ -661,33 +676,28 @@ def main(): if num_added_tokens != args.num_vectors: raise ValueError( f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different" - " `placeholder_token` that is not already in the tokenizer.") + " `placeholder_token` that is not already in the tokenizer." + ) # Convert the initializer_token, placeholder_token to ids - initializer_token_ids = tokenizer.encode( - args.initializer_token, add_special_tokens=False)["input_ids"] + initializer_token_ids = tokenizer.encode(args.initializer_token, add_special_tokens=False)["input_ids"] if len(initializer_token_ids) < 1: - raise ValueError( - "The initializer token must be a greater equal than one.") + raise ValueError("The initializer token must be a greater equal than one.") placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens) - text_encoder_cls = import_model_class_from_model_name_or_path( - args.pretrained_model_name_or_path) + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path) text_encoder = text_encoder_cls.from_pretrained( - url_or_path_join(args.pretrained_model_name_or_path, "text_encoder")) - text_config = (text_encoder.config if isinstance(text_encoder.config, dict) - else text_encoder.config.to_dict()) - if (text_config.get("use_attention_mask", None) is not None and - text_config["use_attention_mask"]): + url_or_path_join(args.pretrained_model_name_or_path, "text_encoder") + ) + text_config = text_encoder.config if isinstance(text_encoder.config, dict) else text_encoder.config.to_dict() + if text_config.get("use_attention_mask", None) is not None and text_config["use_attention_mask"]: use_attention_mask = True else: use_attention_mask = False - vae = AutoencoderKL.from_pretrained( - args.pretrained_model_name_or_path, subfolder="vae") - unet = UNet2DConditionModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="unet") + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae") + unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet") # Resize the token embeddings as we are adding new special tokens to the tokenizer text_encoder.resize_token_embeddings(len(tokenizer)) @@ -698,8 +708,8 @@ def main(): # we will compute mean for token_id in placeholder_token_ids: token_embeds.weight[token_id] = paddle.stack( - [token_embeds.weight[each] - for each in initializer_token_ids]).mean(0) + [token_embeds.weight[each] for each in initializer_token_ids] + ).mean(0) # Freeze vae and unet freeze_params(vae.parameters()) @@ -712,14 +722,14 @@ def main(): # unet.enable_gradient_checkpointing() set_recompute(text_encoder, True) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if 
args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: unet.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) train_dataset = TextualInversionDataset( data_root=args.train_data_dir, @@ -732,71 +742,66 @@ def main(): center_crop=args.center_crop, set="train", language=args.language, - interpolation="bilinear", ) + interpolation="bilinear", + ) def collate_fn(examples): input_ids = [example["input_ids"] for example in examples] - pixel_values = paddle.to_tensor( - [example["pixel_values"] for example in examples], dtype="float32") + pixel_values = paddle.to_tensor([example["pixel_values"] for example in examples], dtype="float32") input_ids = tokenizer.pad( - { - "input_ids": input_ids - }, + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids return { "input_ids": input_ids, "pixel_values": pixel_values, } - train_sampler = (DistributedBatchSampler( - train_dataset, batch_size=args.train_batch_size, shuffle=True) - if num_processes > 1 else BatchSampler( - train_dataset, - batch_size=args.train_batch_size, - shuffle=True)) + train_sampler = ( + DistributedBatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + if num_processes > 1 + else BatchSampler(train_dataset, batch_size=args.train_batch_size, shuffle=True) + ) train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, collate_fn=collate_fn, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) # Scheduler and math around the number of training steps. 
- num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs - args.num_train_epochs = math.ceil(args.max_train_steps / - num_update_steps_per_epoch) + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) if args.scale_lr: - args.learning_rate = (args.learning_rate * - args.gradient_accumulation_steps * - args.train_batch_size * num_processes) + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * num_processes + ) # Initialize the lr_scheduler lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=args.max_train_steps * - args.gradient_accumulation_steps, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, num_cycles=args.lr_num_cycles, - power=args.lr_power, ) + power=args.lr_power, + ) # Initialize the optimizer optimizer = AdamW( learning_rate=lr_scheduler, - parameters=text_encoder.get_input_embeddings().parameters( - ), # only optimize the embeddings + parameters=text_encoder.get_input_embeddings().parameters(), # only optimize the embeddings beta1=args.adam_beta1, beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if num_processes > 1: text_encoder = paddle.DataParallel(text_encoder) @@ -809,35 +814,27 @@ def collate_fn(examples): writer = get_report_to(args) # Train! - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num batches each epoch = {len(train_dataloader)}") logger.info(f" Num Epochs = {args.num_train_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
- progress_bar = tqdm( - range(args.max_train_steps), disable=not is_main_process) + progress_bar = tqdm(range(args.max_train_steps), disable=not is_main_process) progress_bar.set_description("Train Steps") global_step = 0 # keep original embeddings as reference - orig_embeds_params = ( - unwrap_model(text_encoder).get_input_embeddings().weight.clone()) + orig_embeds_params = unwrap_model(text_encoder).get_input_embeddings().weight.clone() - index_no_updates = paddle.ones((len(tokenizer), ), dtype=paddle.bool) - index_no_updates[min(placeholder_token_ids):max(placeholder_token_ids) + - 1] = False + index_no_updates = paddle.ones((len(tokenizer),), dtype=paddle.bool) + index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False index_no_updates = index_no_updates.cast("int64").sum() # Keep vae and unet in eval model as we don't train these vae.eval() @@ -855,20 +852,19 @@ def collate_fn(examples): if args.noise_offset: # https://www.crosslabs.org//blog/diffusion-with-offset-noise noise += args.noise_offset * paddle.randn( - (latents.shape[0], latents.shape[1], 1, 1), - dtype=latents.dtype) + (latents.shape[0], latents.shape[1], 1, 1), dtype=latents.dtype + ) batch_size = latents.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (batch_size, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,)).cast("int64") # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) - if num_processes > 1 and (args.gradient_checkpointing or ( - (step + 1) % args.gradient_accumulation_steps != 0)): + if num_processes > 1 and ( + args.gradient_checkpointing or ((step + 1) % args.gradient_accumulation_steps != 0) + ): # grad acc, no_sync when (step + 1) % args.gradient_accumulation_steps != 0: # gradient_checkpointing, no_sync every where # gradient_checkpointing + grad_acc, no_sync every where @@ -876,35 +872,29 @@ def collate_fn(examples): text_encoder_ctx_manager = text_encoder.no_sync() else: # unet_ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() - text_encoder_ctx_manager = (contextlib.nullcontext() - if sys.version_info >= (3, 7) else - contextlib.suppress()) + text_encoder_ctx_manager = ( + contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + ) with text_encoder_ctx_manager: # Get the text embedding for conditioning if use_attention_mask: - attention_mask = (batch["input_ids"] != - tokenizer.pad_token_id).cast("int64") + attention_mask = (batch["input_ids"] != tokenizer.pad_token_id).cast("int64") else: attention_mask = None - encoder_hidden_states = text_encoder( - batch["input_ids"], attention_mask=attention_mask)[0] + encoder_hidden_states = text_encoder(batch["input_ids"], attention_mask=attention_mask)[0] # with unet_ctx_manager: # Predict the noise or sample - model_pred = unet(noisy_latents, timesteps, - encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, - timesteps) + target = noise_scheduler.get_velocity(latents, noise, timesteps) 
else: - raise ValueError( - f"Unknown prediction type {noise_scheduler.config.prediction_type}" - ) + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") loss = F.mse_loss(model_pred, target, reduction="mean") if args.gradient_accumulation_steps > 1: @@ -914,18 +904,17 @@ def collate_fn(examples): if (step + 1) % args.gradient_accumulation_steps == 0: if num_processes > 1 and args.gradient_checkpointing: fused_allreduce_gradients( - unwrap_model(text_encoder).get_input_embeddings() - .parameters(), - None, ) + unwrap_model(text_encoder).get_input_embeddings().parameters(), + None, + ) optimizer.step() lr_scheduler.step() optimizer.clear_grad() # Let's make sure we don't update any embedding weights besides the newly added token with paddle.no_grad(): - unwrap_model(text_encoder).get_input_embeddings( - ).weight[: - index_no_updates] = orig_embeds_params[: - index_no_updates] + unwrap_model(text_encoder).get_input_embeddings().weight[:index_no_updates] = orig_embeds_params[ + :index_no_updates + ] progress_bar.update(1) global_step += 1 @@ -945,19 +934,19 @@ def collate_fn(examples): if global_step % args.save_steps == 0: save_path = os.path.join( args.output_dir, - f"learned_embeds-steps-{global_step}.pdparams", ) - save_progress(text_encoder, placeholder_token_ids, args, - save_path) + f"learned_embeds-steps-{global_step}.pdparams", + ) + save_progress(text_encoder, placeholder_token_ids, args, save_path) if global_step >= args.max_train_steps: break if is_main_process: - if (args.validation_prompt is not None and - epoch % args.validation_epochs == 0): + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}.") + f" {args.validation_prompt}." + ) # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, @@ -965,29 +954,27 @@ def collate_fn(examples): tokenizer=tokenizer, paddle_dtype=paddle_dtype, safety_checker=None, - requires_safety_checker=False, ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config( - pipeline.scheduler.config) + requires_safety_checker=False, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) pipeline.set_progress_bar_config(disable=True) # run inference - generator = (paddle.Generator().manual_seed(args.seed) - if args.seed else None) + generator = paddle.Generator().manual_seed(args.seed) if args.seed else None images = [ pipeline( args.validation_prompt, num_inference_steps=25, - generator=generator, ).images[0] + generator=generator, + ).images[0] for _ in range(args.num_validation_images) ] np_images = np.stack([np.asarray(img) for img in images]) if args.report_to == "tensorboard": - writer.add_images( - "test", np_images, epoch, dataformats="NHWC") + writer.add_images("test", np_images, epoch, dataformats="NHWC") else: - writer.add_image( - "test", np_images, epoch, dataformats="NHWC") + writer.add_image("test", np_images, epoch, dataformats="NHWC") del pipeline gc.collect() @@ -998,9 +985,7 @@ def collate_fn(examples): if is_main_process: writer.close() if args.push_to_hub and args.only_save_embeds: - logger.warn( - "Enabling full model saving because --push_to_hub=True was specified." 
- ) + logger.warn("Enabling full model saving because --push_to_hub=True was specified.") save_full_model = True else: save_full_model = not args.only_save_embeds @@ -1008,17 +993,15 @@ def collate_fn(examples): pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, text_encoder=unwrap_model(text_encoder), - tokenizer=tokenizer, ) + tokenizer=tokenizer, + ) pipeline.save_pretrained(args.output_dir) # Save the newly trained embeddings save_path = os.path.join(args.output_dir, "learned_embeds.pdparams") save_progress(text_encoder, placeholder_token_ids, args, save_path) if args.push_to_hub: - repo.push_to_hub( - commit_message="End of training", - blocking=False, - auto_lfs_prune=True) + repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True) if __name__ == "__main__": diff --git a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py b/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py index a157a1f5c1f04..80af56cbf7391 100644 --- a/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py +++ b/ppdiffusers/examples/unconditional_image_generation/train_unconditional.py @@ -73,8 +73,7 @@ def get_report_to(args): def parse_args(): - parser = argparse.ArgumentParser( - description="Simple example of a training script.") + parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( "--dataset_name", type=str, @@ -83,7 +82,8 @@ def parse_args(): "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," " or to a folder containing files that HF Datasets can understand." - ), ) + ), + ) parser.add_argument( "--dataset_config_name", type=str, @@ -104,7 +104,8 @@ def parse_args(): "A folder containing the training data. Folder contents must follow the structure described in" " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." - ), ) + ), + ) parser.add_argument( "--output_dir", type=str, @@ -124,7 +125,9 @@ def parse_args(): default=64, help=( "The resolution for input images, all the images in the train/validation dataset will be resized to this" - " resolution"), ) + " resolution" + ), + ) parser.add_argument( "--center_crop", default=False, @@ -132,40 +135,48 @@ def parse_args(): help=( "Whether to center crop the input images to the resolution. If not set, the images will be randomly" " cropped. The images will be resized to the resolution first before cropping." - ), ) + ), + ) parser.add_argument( "--random_flip", default=False, action="store_true", - help="whether to randomly flip images horizontally", ) + help="whether to randomly flip images horizontally", + ) parser.add_argument( "--train_batch_size", type=int, default=16, - help="Batch size (per device) for the training dataloader.", ) + help="Batch size (per device) for the training dataloader.", + ) parser.add_argument( "--eval_batch_size", type=int, default=16, - help="The number of images to generate for evaluation.", ) + help="The number of images to generate for evaluation.", + ) parser.add_argument( "--dataloader_num_workers", type=int, default=0, help=( "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main" - " process."), ) + " process." 
+ ), + ) parser.add_argument("--num_epochs", type=int, default=100) parser.add_argument( "--save_images_epochs", type=int, default=10, - help="How often to save images during training.", ) + help="How often to save images during training.", + ) parser.add_argument( "--save_model_epochs", type=int, default=10, - help="How often to save the model during training.", ) + help="How often to save the model during training.", + ) parser.add_argument( "--gradient_accumulation_steps", type=int, @@ -184,34 +195,40 @@ def parse_args(): default="cosine", help=( 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' - ' "constant", "constant_with_warmup"]'), ) + ' "constant", "constant_with_warmup"]' + ), + ) parser.add_argument( "--lr_warmup_steps", type=int, default=500, - help="Number of steps for the warmup in the lr scheduler.", ) + help="Number of steps for the warmup in the lr scheduler.", + ) parser.add_argument( "--adam_beta1", type=float, default=0.95, - help="The beta1 parameter for the Adam optimizer.", ) + help="The beta1 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_beta2", type=float, default=0.999, - help="The beta2 parameter for the Adam optimizer.", ) + help="The beta2 parameter for the Adam optimizer.", + ) parser.add_argument( "--adam_weight_decay", type=float, default=1e-6, - help="Weight decay magnitude for the Adam optimizer.", ) + help="Weight decay magnitude for the Adam optimizer.", + ) parser.add_argument( "--adam_epsilon", type=float, default=1e-08, - help="Epsilon value for the Adam optimizer.", ) - parser.add_argument( - "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + help="Epsilon value for the Adam optimizer.", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--use_ema", action="store_true", @@ -221,26 +238,31 @@ def parse_args(): "--ema_inv_gamma", type=float, default=1.0, - help="The inverse gamma value for the EMA decay.", ) + help="The inverse gamma value for the EMA decay.", + ) parser.add_argument( "--ema_power", type=float, default=3 / 4, - help="The power value for the EMA decay.", ) + help="The power value for the EMA decay.", + ) parser.add_argument( "--ema_max_decay", type=float, default=0.9999, - help="The maximum decay magnitude for EMA.", ) + help="The maximum decay magnitude for EMA.", + ) parser.add_argument( "--push_to_hub", action="store_true", - help="Whether or not to push the model to the Hub.", ) + help="Whether or not to push the model to the Hub.", + ) parser.add_argument( "--hub_token", type=str, default=None, - help="The token to use to push to the Model Hub.", ) + help="The token to use to push to the Model Hub.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -250,7 +272,8 @@ def parse_args(): parser.add_argument( "--hub_private_repo", action="store_true", - help="Whether or not to create a private repository.", ) + help="Whether or not to create a private repository.", + ) parser.add_argument( "--logger", type=str, @@ -259,14 +282,17 @@ def parse_args(): help=( "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)" " for experiment tracking and logging of model metrics and model checkpoints" - ), ) + ), + ) parser.add_argument( "--logging_dir", type=str, default="logs", help=( "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. 
Will default to" - " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."), ) + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) parser.add_argument( "--prediction_type", type=str, @@ -283,7 +309,9 @@ def parse_args(): default=500, help=( "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" - " training using `--resume_from_checkpoint`."), ) + " training using `--resume_from_checkpoint`." + ), + ) parser.add_argument( "--checkpoints_total_limit", type=int, @@ -291,29 +319,24 @@ def parse_args(): help=( "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs"), ) - parser.add_argument( - "--seed", - type=int, - default=None, - help="A seed for reproducible training.") + " for more docs" + ), + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", - help="Whether or not to use xformers.", ) + help="Whether or not to use xformers.", + ) args = parser.parse_args() if args.dataset_name is None and args.train_data_dir is None: - raise ValueError( - "You must specify either a dataset name from the hub or a train data directory." - ) + raise ValueError("You must specify either a dataset name from the hub or a train data directory.") return args -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -349,8 +372,7 @@ def save_model_hook(models, weights, output_dir): def load_model_hook(models, input_dir): if args.use_ema: - load_model = EMAModel.from_pretrained( - os.path.join(input_dir, "unet_ema"), UNet2DModel) + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel) ema_model.load_state_dict(load_model.state_dict()) del load_model @@ -359,8 +381,7 @@ def load_model_hook(models, input_dir): model = models.pop() # load ppdiffusers style into model - load_model = UNet2DModel.from_pretrained( - input_dir, subfolder="unet") + load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet") model.register_to_config(**load_model.config) model.load_state_dict(load_model.state_dict()) @@ -374,21 +395,20 @@ def load_model_hook(models, input_dir): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, ) + level=logging.INFO, + ) # Handle the repository creation if is_main_process: if args.push_to_hub: if args.hub_model_id is None: - repo_name = get_full_repo_name( - Path(args.output_dir).name, token=args.hub_token) + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id create_repo(repo_name, exist_ok=True, token=args.hub_token) # repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), - "w+") as gitignore: + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: @@ -410,14 +430,17 @@ def load_model_hook(models, input_dir): 
"DownBlock2D", "DownBlock2D", "AttnDownBlock2D", - "DownBlock2D", ), + "DownBlock2D", + ), up_block_types=( "UpBlock2D", "AttnUpBlock2D", "UpBlock2D", "UpBlock2D", "UpBlock2D", - "UpBlock2D", ), ) + "UpBlock2D", + ), + ) else: config = UNet2DModel.load_config(args.model_config_name_or_path) model = UNet2DModel.from_config(config) @@ -431,28 +454,30 @@ def load_model_hook(models, input_dir): inv_gamma=args.ema_inv_gamma, power=args.ema_power, model_cls=UNet2DModel, - model_config=model.config, ) + model_config=model.config, + ) - if args.enable_xformers_memory_efficient_attention and is_ppxformers_available( - ): + if args.enable_xformers_memory_efficient_attention and is_ppxformers_available(): try: model.enable_xformers_memory_efficient_attention() except Exception as e: logger.warn( "Could not enable memory efficient attention. Make sure develop paddlepaddle is installed" - f" correctly and a GPU is available: {e}") + f" correctly and a GPU is available: {e}" + ) # Initialize the scheduler - accepts_prediction_type = "prediction_type" in set( - inspect.signature(DDPMScheduler.__init__).parameters.keys()) + accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys()) if accepts_prediction_type: noise_scheduler = DDPMScheduler( num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule, - prediction_type=args.prediction_type, ) + prediction_type=args.prediction_type, + ) else: noise_scheduler = DDPMScheduler( num_train_timesteps=args.ddpm_num_steps, - beta_schedule=args.ddpm_beta_schedule, ) + beta_schedule=args.ddpm_beta_schedule, + ) # Get the datasets: you can either provide your own training and evaluation files (see below) # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). @@ -464,31 +489,30 @@ def load_model_hook(models, input_dir): args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, - split="train", ) + split="train", + ) else: dataset = load_dataset( "imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, - split="train", ) + split="train", + ) # See more about loading custom images at # Preprocessing the datasets and DataLoaders creation. 
- augmentations = transforms.Compose([ - transforms.Resize( - args.resolution, interpolation="bilinear"), - transforms.CenterCrop(args.resolution) - if args.center_crop else transforms.RandomCrop(args.resolution), - transforms.RandomHorizontalFlip() - if args.random_flip else transforms.Lambda(lambda x: x), - transforms.ToTensor(), - transforms.Normalize([0.5], [0.5]), - ]) + augmentations = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation="bilinear"), + transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) def transform_images(examples): - images = [ - augmentations(image.convert("RGB")) for image in examples["image"] - ] + images = [augmentations(image.convert("RGB")) for image in examples["image"]] return {"input": images} # logger.info(f"Dataset size: {len(dataset)}") @@ -498,7 +522,8 @@ def transform_images(examples): dataset, batch_size=args.train_batch_size, shuffle=True, - num_workers=args.dataloader_num_workers, ) + num_workers=args.dataloader_num_workers, + ) if num_processes > 1: model = paddle.DataParallel(model) @@ -507,9 +532,9 @@ def transform_images(examples): lr_scheduler = get_scheduler( args.lr_scheduler, learning_rate=args.learning_rate, - num_warmup_steps=args.lr_warmup_steps * - args.gradient_accumulation_steps, - num_training_steps=(len(train_dataloader) * args.num_epochs), ) + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=(len(train_dataloader) * args.num_epochs), + ) # Initialize the optimizer optimizer = paddle.optimizer.AdamW( @@ -519,8 +544,8 @@ def transform_images(examples): beta2=args.adam_beta2, weight_decay=args.adam_weight_decay, epsilon=args.adam_epsilon, - grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) - if args.max_grad_norm > 0 else None, ) + grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm) if args.max_grad_norm > 0 else None, + ) if is_main_process: logger.info("----------- Configuration Arguments -----------") @@ -530,22 +555,16 @@ def transform_images(examples): writer = get_report_to(args) # Prepare everything with our `accelerator`. - total_batch_size = (args.train_batch_size * num_processes * - args.gradient_accumulation_steps) - num_update_steps_per_epoch = math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps) + total_batch_size = args.train_batch_size * num_processes * args.gradient_accumulation_steps + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) max_train_steps = args.num_epochs * num_update_steps_per_epoch logger.info("***** Running training *****") logger.info(f" Num examples = {len(dataset)}") logger.info(f" Num Epochs = {args.num_epochs}") - logger.info( - f" Instantaneous batch size per device = {args.train_batch_size}") - logger.info( - f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" - ) - logger.info( - f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {max_train_steps}") global_step = 0 @@ -554,8 +573,7 @@ def transform_images(examples): # Train! for epoch in range(first_epoch, args.num_epochs): model.train() - progress_bar = tqdm( - total=num_update_steps_per_epoch, disable=not is_main_process) + progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not is_main_process) progress_bar.set_description(f"Epoch {epoch}") for step, batch in enumerate(train_dataloader): clean_images = batch["input"] @@ -563,34 +581,30 @@ def transform_images(examples): noise = paddle.randn(clean_images.shape) bsz = clean_images.shape[0] # Sample a random timestep for each image - timesteps = paddle.randint( - 0, noise_scheduler.config.num_train_timesteps, - (bsz, )).cast("int64") + timesteps = paddle.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,)).cast("int64") # Add noise to the clean images according to the noise magnitude at each timestep # (this is the forward diffusion process) - noisy_images = noise_scheduler.add_noise(clean_images, noise, - timesteps) + noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps) # Predict the noise residual model_output = model(noisy_images, timesteps).sample if args.prediction_type == "epsilon": - loss = F.mse_loss(model_output, - noise) # this could have different weights! + loss = F.mse_loss(model_output, noise) # this could have different weights! elif args.prediction_type == "sample": alpha_t = _extract_into_tensor( noise_scheduler.alphas_cumprod, timesteps, - (clean_images.shape[0], 1, 1, 1), ) + (clean_images.shape[0], 1, 1, 1), + ) snr_weights = alpha_t / (1 - alpha_t) loss = snr_weights * F.mse_loss( model_output, clean_images, reduction="none" ) # use SNR weighting from distillation paper loss = loss.mean() else: - raise ValueError( - f"Unsupported prediction type: {args.prediction_type}") + raise ValueError(f"Unsupported prediction type: {args.prediction_type}") loss.backward() @@ -607,13 +621,10 @@ def transform_images(examples): if global_step % args.checkpointing_steps == 0: if is_main_process: - save_path = os.path.join(args.output_dir, - f"checkpoint-{global_step}") + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") if args.use_ema: - unwrap_model(ema_model).save_pretrained( - os.path.join(save_path, "unet_ema")) - unwrap_model(model).save_pretrained( - os.path.join(save_path, "unet")) + unwrap_model(ema_model).save_pretrained(os.path.join(save_path, "unet_ema")) + unwrap_model(model).save_pretrained(os.path.join(save_path, "unet")) logger.info(f"Saved state to {save_path}") @@ -638,7 +649,8 @@ def transform_images(examples): ema_model.copy_to(unet.parameters()) pipeline = DDPMPipeline( unet=unet, - scheduler=noise_scheduler, ) + scheduler=noise_scheduler, + ) generator = paddle.Generator().manual_seed(0) # run pipeline in inference (sample random noise and denoise) @@ -646,7 +658,8 @@ def transform_images(examples): generator=generator, batch_size=args.eval_batch_size, num_inference_steps=args.ddpm_num_inference_steps, - output_type="numpy", ).images + output_type="numpy", + ).images if args.use_ema: ema_model.restore(unet.parameters()) @@ -657,13 +670,15 @@ def transform_images(examples): "test", images_processed.transpose(0, 3, 1, 2), epoch, - dataformats="NHWC", ) + dataformats="NHWC", + ) else: writer.add_image( "test", images_processed.transpose(0, 3, 1, 
2), epoch, - dataformats="NHWC", ) + dataformats="NHWC", + ) if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1: # save the model @@ -676,7 +691,8 @@ def transform_images(examples): pipeline = DDPMPipeline( unet=unet, - scheduler=noise_scheduler, ) + scheduler=noise_scheduler, + ) pipeline.save_pretrained(args.output_dir) diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py index f8c3b7f6ce1f4..f86f792718938 100644 --- a/ppdiffusers/ppdiffusers/__init__.py +++ b/ppdiffusers/ppdiffusers/__init__.py @@ -17,13 +17,26 @@ from . import patches from .configuration_utils import ConfigMixin from .utils import ( - OptionalDependencyNotAvailable, is_einops_available, - is_fastdeploy_available, is_inflect_available, is_k_diffusion_available, - is_k_diffusion_version, is_librosa_available, is_note_seq_available, - is_paddle_available, is_paddle_version, is_paddlenlp_available, - is_paddlenlp_version, is_ppxformers_available, is_safetensors_available, - is_scipy_available, is_torch_available, is_unidecode_available, - is_visualdl_available, logging) + OptionalDependencyNotAvailable, + is_einops_available, + is_fastdeploy_available, + is_inflect_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_librosa_available, + is_note_seq_available, + is_paddle_available, + is_paddle_version, + is_paddlenlp_available, + is_paddlenlp_version, + is_ppxformers_available, + is_safetensors_available, + is_scipy_available, + is_torch_available, + is_unidecode_available, + is_visualdl_available, + logging, +) from .version import VERSION as __version__ try: @@ -41,32 +54,75 @@ from .utils.dummy_paddle_objects import * # noqa F403 else: from .models import ( - AutoencoderKL, ControlNetModel, LitEma, LVDMAutoencoderKL, - LVDMUNet3DModel, ModelMixin, MultiAdapter, PriorTransformer, T2IAdapter, - T5FilmDecoder, Transformer2DModel, UNet1DModel, UNet2DConditionModel, - UNet2DModel, UNet3DConditionModel, VQModel) + AutoencoderKL, + ControlNetModel, + LitEma, + LVDMAutoencoderKL, + LVDMUNet3DModel, + ModelMixin, + MultiAdapter, + PriorTransformer, + T2IAdapter, + T5FilmDecoder, + Transformer2DModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + UNet3DConditionModel, + VQModel, + ) from .optimization import ( - get_constant_schedule, get_constant_schedule_with_warmup, + get_constant_schedule, + get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup, - get_polynomial_decay_schedule_with_warmup, get_scheduler) + get_polynomial_decay_schedule_with_warmup, + get_scheduler, + ) from .pipelines import ( - AudioPipelineOutput, DanceDiffusionPipeline, DDIMPipeline, DDPMPipeline, - DiffusionPipeline, DiTPipeline, ImagePipelineOutput, KarrasVePipeline, - LDMPipeline, LDMSuperResolutionPipeline, PNDMPipeline, RePaintPipeline, - ScoreSdeVePipeline, TextPipelineOutput) + AudioPipelineOutput, + DanceDiffusionPipeline, + DDIMPipeline, + DDPMPipeline, + DiffusionPipeline, + DiTPipeline, + ImagePipelineOutput, + KarrasVePipeline, + LDMPipeline, + LDMSuperResolutionPipeline, + PNDMPipeline, + RePaintPipeline, + ScoreSdeVePipeline, + TextPipelineOutput, + ) from .schedulers import ( - DDIMInverseScheduler, DDIMScheduler, DDPMScheduler, - DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, DPMSolverUniDiffuserScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, IPNDMScheduler, KarrasVeScheduler, - 
KDPM2AncestralDiscreteScheduler, KDPM2DiscreteScheduler, PNDMScheduler, - RePaintScheduler, SchedulerMixin, ScoreSdeVeScheduler, UnCLIPScheduler, - UniPCMultistepScheduler, VQDiffusionScheduler) - from .schedulers.preconfig import (PreconfigEulerAncestralDiscreteScheduler, - PreconfigLMSDiscreteScheduler) + DDIMInverseScheduler, + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + DPMSolverUniDiffuserScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + IPNDMScheduler, + KarrasVeScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + PNDMScheduler, + RePaintScheduler, + SchedulerMixin, + ScoreSdeVeScheduler, + UnCLIPScheduler, + UniPCMultistepScheduler, + VQDiffusionScheduler, + ) + from .schedulers.preconfig import ( + PreconfigEulerAncestralDiscreteScheduler, + PreconfigLMSDiscreteScheduler, + ) from .training_utils import EMAModel try: @@ -84,36 +140,58 @@ from .utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 else: from .pipelines import ( - AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, AudioLDMPipeline, - CycleDiffusionPipeline, IFImg2ImgPipeline, - IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, IFPipeline, - IFSuperResolutionPipeline, LDMTextToImagePipeline, - LVDMTextToVideoPipeline, LVDMUncondPipeline, PaintByExamplePipeline, - SemanticStableDiffusionPipeline, StableDiffusionAdapterPipeline, + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + AudioLDMPipeline, + CycleDiffusionPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + LDMTextToImagePipeline, + LVDMTextToVideoPipeline, + LVDMUncondPipeline, + PaintByExamplePipeline, + SemanticStableDiffusionPipeline, + StableDiffusionAdapterPipeline, StableDiffusionAttendAndExcitePipeline, - StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, - StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, + StableDiffusionControlNetPipeline, + StableDiffusionDepth2ImgPipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, StableDiffusionMegaPipeline, - StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, - StableDiffusionPipeline, StableDiffusionPipelineAllinOne, - StableDiffusionPipelineSafe, StableDiffusionPix2PixZeroPipeline, - StableDiffusionSAGPipeline, StableDiffusionUpscalePipeline, - StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, - TextToVideoSDPipeline, TextToVideoZeroPipeline, - UnCLIPImageVariationPipeline, UnCLIPPipeline, UniDiffuserPipeline, + StableDiffusionLatentUpscalePipeline, + StableDiffusionMegaPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionPipeline, + StableDiffusionPipelineAllinOne, + StableDiffusionPipelineSafe, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, + StableDiffusionUpscalePipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + TextToVideoSDPipeline, + TextToVideoZeroPipeline, + UnCLIPImageVariationPipeline, + UnCLIPPipeline, + UniDiffuserPipeline, VersatileDiffusionDualGuidedPipeline, - 
VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline, VQDiffusionPipeline) - from .pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VQDiffusionPipeline, + ) + from .pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel from .pipelines.unidiffuser.caption_decoder import CaptionDecoder try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_k_diffusion_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403 @@ -121,21 +199,22 @@ from .pipelines import StableDiffusionKDiffusionPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_fastdeploy_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 else: - from .pipelines import (FastDeployCycleDiffusionPipeline, - FastDeployStableDiffusionControlNetPipeline, - FastDeployStableDiffusionImageVariationPipeline, - FastDeployStableDiffusionImg2ImgPipeline, - FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionInpaintPipelineLegacy, - FastDeployStableDiffusionMegaPipeline, - FastDeployStableDiffusionPipeline, - FastDeployStableDiffusionUpscalePipeline) + from .pipelines import ( + FastDeployCycleDiffusionPipeline, + FastDeployStableDiffusionControlNetPipeline, + FastDeployStableDiffusionImageVariationPipeline, + FastDeployStableDiffusionImg2ImgPipeline, + FastDeployStableDiffusionInpaintPipeline, + FastDeployStableDiffusionInpaintPipelineLegacy, + FastDeployStableDiffusionMegaPipeline, + FastDeployStableDiffusionPipeline, + FastDeployStableDiffusionUpscalePipeline, + ) try: if not (is_paddle_available() and is_librosa_available()): @@ -146,8 +225,7 @@ from .pipelines import AudioDiffusionPipeline, Mel try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_note_seq_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403 @@ -155,8 +233,7 @@ from .pipelines import SpectrogramDiffusionPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_einops_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403 diff --git a/ppdiffusers/ppdiffusers/commands/env.py b/ppdiffusers/ppdiffusers/commands/env.py index a020de6813b7d..0ad95fd647340 100644 --- a/ppdiffusers/ppdiffusers/commands/env.py +++ b/ppdiffusers/ppdiffusers/commands/env.py @@ -57,9 +57,7 @@ def run(self): "Using distributed or parallel set-up in script?": "", } - print( - "\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n" - ) + print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last 
points.\n")
         print(self.format_dict(info))
 
         return info
diff --git a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py b/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
index d14e14711dedc..7575e5902a50e 100644
--- a/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
+++ b/ppdiffusers/ppdiffusers/commands/ppdiffusers_cli.py
@@ -20,10 +20,8 @@
 
 
 def main():
-    parser = ArgumentParser(
-        "PPDiffusers CLI tool", usage="ppdiffusers-cli []")
-    commands_parser = parser.add_subparsers(
-        help="ppdiffusers-cli command helpers")
+    parser = ArgumentParser("PPDiffusers CLI tool", usage="ppdiffusers-cli []")
+    commands_parser = parser.add_subparsers(help="ppdiffusers-cli command helpers")
 
     # Register commands
     EnvironmentCommand.register_subcommand(commands_parser)
diff --git a/ppdiffusers/ppdiffusers/configuration_utils.py b/ppdiffusers/ppdiffusers/configuration_utils.py
index 2c5d4e88c84e7..551fb118afa9e 100644
--- a/ppdiffusers/ppdiffusers/configuration_utils.py
+++ b/ppdiffusers/ppdiffusers/configuration_utils.py
@@ -33,9 +33,16 @@
 import numpy as np
 import paddle
 
-from .utils import (DIFFUSERS_CACHE, PPDIFFUSERS_CACHE, DummyObject,
-                    bos_hf_download, deprecate, extract_commit_hash,
-                    http_user_agent, logging)
+from .utils import (
+    DIFFUSERS_CACHE,
+    PPDIFFUSERS_CACHE,
+    DummyObject,
+    bos_hf_download,
+    deprecate,
+    extract_commit_hash,
+    http_user_agent,
+    logging,
+)
 from .utils.constants import FROM_HF_HUB
 from .version import VERSION as __version__
 
@@ -54,36 +61,25 @@ def __init__(self, *args, **kwargs):
         self.__frozen = True
 
     def __delitem__(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance."
-        )
+        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
 
     def setdefault(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance."
-        )
+        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
 
     def pop(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
 
     def update(self, *args, **kwargs):
-        raise Exception(
-            f"You cannot use ``update`` on a {self.__class__.__name__} instance."
-        )
+        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
 
     def __setattr__(self, name, value):
         if hasattr(self, "__frozen") and self.__frozen:
-            raise Exception(
-                f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance."
-            )
+            raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.")
         super().__setattr__(name, value)
 
     def __setitem__(self, name, value):
         if hasattr(self, "__frozen") and self.__frozen:
-            raise Exception(
-                f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance."
- ) + raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.") super().__setitem__(name, value) @@ -112,9 +108,7 @@ class ConfigMixin: def register_to_config(self, **kwargs): if self.config_name is None: - raise NotImplementedError( - f"Make sure that {self.__class__} has defined a class name `config_name`" - ) + raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`") # Special case for `kwargs` used in deprecation warning added to schedulers # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument, # or solve in a more general way. @@ -124,9 +118,8 @@ def register_to_config(self, **kwargs): internal_dict = kwargs else: previous_dict = dict(self._internal_dict) - internal_dict = { ** self._internal_dict, ** kwargs} - logger.debug( - f"Updating config from {previous_dict} to {internal_dict}") + internal_dict = {**self._internal_dict, **kwargs} + logger.debug(f"Updating config from {previous_dict} to {internal_dict}") self._internal_dict = FrozenDict(internal_dict) @@ -137,8 +130,7 @@ def __getattr__(self, name: str) -> Any: https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module """ - is_in_config = "_internal_dict" in self.__dict__ and hasattr( - self.__dict__["_internal_dict"], name) + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) is_attribute = name in self.__dict__ if is_in_config and not is_attribute: @@ -147,18 +139,19 @@ def __getattr__(self, name: str) -> Any: "direct config name access", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) return self._internal_dict[name] - raise AttributeError( - f"'{type(self).__name__}' object has no attribute '{name}'") + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def save_config( - self, - save_directory: Union[str, os.PathLike], - push_to_hub: bool=False, - to_diffusers=False, - **kwargs, ): + self, + save_directory: Union[str, os.PathLike], + push_to_hub: bool = False, + to_diffusers=False, + **kwargs, + ): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the [`~ConfigMixin.from_config`] class method. @@ -168,9 +161,7 @@ def save_config( Directory where the configuration JSON file will be saved (will be created if it does not exist). """ if os.path.isfile(save_directory): - raise AssertionError( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") os.makedirs(save_directory, exist_ok=True) @@ -182,10 +173,11 @@ def save_config( @classmethod def from_config( - cls, - config: Union[FrozenDict, Dict[str, Any]]=None, - return_unused_kwargs=False, - **kwargs, ): + cls, + config: Union[FrozenDict, Dict[str, Any]] = None, + return_unused_kwargs=False, + **kwargs, + ): r""" Instantiate a Python class from a config dictionary @@ -222,9 +214,7 @@ def from_config( config = kwargs.pop("pretrained_model_name_or_path") if config is None: - raise ValueError( - "Please make sure to provide a config as the first positional argument." - ) + raise ValueError("Please make sure to provide a config as the first positional argument.") # ======> if not isinstance(config, dict): @@ -233,24 +223,27 @@ def from_config( deprecation_message += ( f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead." 
" Otherwise, please make sure to pass a configuration dictionary instead. This functionality will" - " be removed in v1.0.0.") + " be removed in v1.0.0." + ) elif "Model" in cls.__name__: deprecation_message += ( f"If you were trying to load a model, please use {cls}.load_config(...) followed by" f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary" - " instead. This functionality will be removed in v1.0.0.") + " instead. This functionality will be removed in v1.0.0." + ) deprecate( "config-passed-as-path", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) config, kwargs = cls.load_config( pretrained_model_name_or_path=config, return_unused_kwargs=True, - **kwargs, ) + **kwargs, + ) - init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, - **kwargs) + init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs) # Allow dtype to be specified on initialization if "dtype" in unused_kwargs: @@ -259,8 +252,7 @@ def from_config( # add possible deprecated kwargs for deprecated_kwarg in cls._deprecated_kwargs: if deprecated_kwarg in unused_kwargs: - init_dict[deprecated_kwarg] = unused_kwargs.pop( - deprecated_kwarg) + init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg) # Return model and optionally state and/or unused_kwargs model = cls(**init_dict) @@ -269,7 +261,7 @@ def from_config( model.register_to_config(**hidden_dict) # add hidden kwargs of compatible classes to unused_kwargs - unused_kwargs = { ** unused_kwargs, ** hidden_dict} + unused_kwargs = {**unused_kwargs, **hidden_dict} if return_unused_kwargs: return (model, unused_kwargs) @@ -280,21 +272,19 @@ def from_config( def get_config_dict(cls, *args, **kwargs): deprecation_message = ( f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. 
This function will be" - " removed in version v1.0.0") - deprecate( - "get_config_dict", - "1.0.0", - deprecation_message, - standard_warn=False) + " removed in version v1.0.0" + ) + deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False) return cls.load_config(*args, **kwargs) @classmethod def load_config( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - return_unused_kwargs=False, - return_commit_hash=False, - **kwargs, ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + return_unused_kwargs=False, + return_commit_hash=False, + **kwargs, + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: r""" Instantiate a Python class from a config dictionary @@ -354,8 +344,9 @@ def load_config( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) @@ -365,7 +356,7 @@ def load_config( _ = kwargs.pop("mirror", None) subfolder = kwargs.pop("subfolder", None) user_agent = kwargs.pop("user_agent", {}) - user_agent = { ** user_agent, "file_type": "config"} + user_agent = {**user_agent, "file_type": "config"} user_agent = http_user_agent(user_agent) # new add return_config_file return_config_file = kwargs.pop("return_config_file", False) @@ -381,17 +372,13 @@ def load_config( if os.path.isfile(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile( - os.path.join(pretrained_model_name_or_path, - cls.config_name)): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): # Load from a PyTorch checkpoint - config_file = os.path.join(pretrained_model_name_or_path, - cls.config_name) + config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, - cls.config_name)): - config_file = os.path.join(pretrained_model_name_or_path, - subfolder, cls.config_name) + os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) + ): + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) else: raise EnvironmentError( f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." @@ -409,7 +396,8 @@ def load_config( user_agent=user_agent, subfolder=subfolder, revision=revision, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) try: # Load config dict @@ -417,23 +405,20 @@ def load_config( commit_hash = extract_commit_hash(config_file) except (json.JSONDecodeError, UnicodeDecodeError): - raise EnvironmentError( - f"It looks like the config file at '{config_file}' is not a valid JSON file." 
- ) + raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.") - if not (return_unused_kwargs or return_commit_hash or - return_config_file): + if not (return_unused_kwargs or return_commit_hash or return_config_file): return config_dict - outputs = (config_dict, ) + outputs = (config_dict,) if return_unused_kwargs: - outputs += (kwargs, ) + outputs += (kwargs,) if return_commit_hash: - outputs += (commit_hash, ) + outputs += (commit_hash,) if return_config_file: - outputs += (config_file, ) + outputs += (config_file,) return outputs @@ -462,43 +447,26 @@ def extract_init_dict(cls, config_dict, **kwargs): ppdiffusers_library = importlib.import_module(__name__.split(".")[0]) if cls.has_compatibles: - compatible_classes = [ - c for c in cls._get_compatibles() - if not isinstance(c, DummyObject) - ] + compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)] else: compatible_classes = [] expected_keys_comp_cls = set() for c in compatible_classes: expected_keys_c = cls._get_init_keys(c) - expected_keys_comp_cls = expected_keys_comp_cls.union( - expected_keys_c) - expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys( - cls) - config_dict = { - k: v - for k, v in config_dict.items() if k not in expected_keys_comp_cls - } + expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c) + expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls) + config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls} # remove attributes from orig class that cannot be expected orig_cls_name = config_dict.pop("_class_name", cls.__name__) - if orig_cls_name != cls.__name__ and hasattr(ppdiffusers_library, - orig_cls_name): + if orig_cls_name != cls.__name__ and hasattr(ppdiffusers_library, orig_cls_name): orig_cls = getattr(ppdiffusers_library, orig_cls_name) - unexpected_keys_from_orig = cls._get_init_keys( - orig_cls) - expected_keys - config_dict = { - k: v - for k, v in config_dict.items() - if k not in unexpected_keys_from_orig - } + unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys + config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig} # remove private attributes - config_dict = { - k: v - for k, v in config_dict.items() if not k.startswith("_") - } + config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")} # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments init_dict = {} @@ -520,7 +488,8 @@ def extract_init_dict(cls, config_dict, **kwargs): logger.warning( f"The config attributes {config_dict} were passed to {cls.__name__}, " "but are not expected and will be ignored. Please verify your " - f"{cls.config_name} configuration file.") + f"{cls.config_name} configuration file." + ) # 5. Give nice info if config attributes are initiliazed to default because they have not been passed passed_keys = set(init_dict.keys()) @@ -530,13 +499,10 @@ def extract_init_dict(cls, config_dict, **kwargs): ) # 6. Define unused keyword arguments - unused_kwargs = { ** config_dict, ** kwargs} + unused_kwargs = {**config_dict, **kwargs} # 7. 
Define "hidden" config parameters that were saved for compatible classes - hidden_config_dict = { - k: v - for k, v in original_dict.items() if k not in init_dict - } + hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict} return init_dict, unused_kwargs, hidden_config_dict @@ -546,8 +512,7 @@ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): text = reader.read() data = json.loads(text) if "_diffusers_version" in data and "_ppdiffusers_version" not in data: - data["_ppdiffusers_version"] = data.pop("_diffusers_version", - __version__) + data["_ppdiffusers_version"] = data.pop("_diffusers_version", __version__) if "_diffusers_version" not in data and "_ppdiffusers_version" not in data: data["_ppdiffusers_version"] = __version__ @@ -581,8 +546,7 @@ def to_json_string(self, to_diffusers=False) -> str: Returns: `str`: String containing all the attributes that make up this configuration instance in JSON format. """ - config_dict = self._internal_dict if hasattr(self, - "_internal_dict") else {} + config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {} config_dict["_class_name"] = self.__class__.__name__ # json @@ -609,14 +573,12 @@ def to_json_saveable(value): config_dict.pop("_ignore_files", None) json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" if to_diffusers: - json_string = json_string.replace( - '"ppdiffusers"', '"diffusers"').replace( - '"paddlenlp.transformers"', '"transformers"') + json_string = json_string.replace('"ppdiffusers"', '"diffusers"').replace( + '"paddlenlp.transformers"', '"transformers"' + ) return json_string - def to_json_file(self, - json_file_path: Union[str, os.PathLike], - to_diffusers=False): + def to_json_file(self, json_file_path: Union[str, os.PathLike], to_diffusers=False): """ Save this instance to a JSON file. @@ -641,41 +603,39 @@ def register_to_config(init): def inner_init(self, *args, **kwargs): # Ignore private kwargs in the init. init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} - config_init_kwargs = { - k: v - for k, v in kwargs.items() if k.startswith("_") - } + config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")} if not isinstance(self, ConfigMixin): raise RuntimeError( f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does " - "not inherit from `ConfigMixin`.") + "not inherit from `ConfigMixin`." 
+ ) ignore = getattr(self, "ignore_for_config", []) # Get positional arguments aligned with kwargs new_kwargs = {} signature = inspect.signature(init) parameters = { - name: p.default - for i, (name, p) in enumerate(signature.parameters.items()) - if i > 0 and name not in ignore + name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore } for arg, name in zip(args, parameters.keys()): new_kwargs[name] = arg # Then add all kwargs - new_kwargs.update({ - k: init_kwargs.get(k, default) - for k, default in parameters.items() - if k not in ignore and k not in new_kwargs - }) - new_kwargs = { ** config_init_kwargs, ** new_kwargs} + new_kwargs.update( + { + k: init_kwargs.get(k, default) + for k, default in parameters.items() + if k not in ignore and k not in new_kwargs + } + ) + new_kwargs = {**config_init_kwargs, **new_kwargs} getattr(self, "register_to_config")(**new_kwargs) init(self, *args, **init_kwargs) return inner_init -def finfo(dtype: paddle.dtype=None): +def finfo(dtype: paddle.dtype = None): if dtype is None: dtype = paddle.get_default_dtype() @@ -699,10 +659,11 @@ class ModuleUtilsMixin: """ def get_extended_attention_mask( - self, - attention_mask: paddle.Tensor, - input_shape: Tuple[int], - dtype: paddle.float32=None, ) -> paddle.Tensor: + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + dtype: paddle.float32 = None, + ) -> paddle.Tensor: """ Makes broadcastable attention and causal masks so that future and masked tokens are ignored. Arguments: @@ -725,14 +686,15 @@ def get_extended_attention_mask( extended_attention_mask = attention_mask[:, None, None, :] else: raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})". - format(input_shape, attention_mask.shape)) + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
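# Illustrative aside (not part of the diff above): a toy numpy sketch of the additive-mask
# trick the preceding comment describes. Shapes and values here are hypothetical.
import numpy as np

attention_mask = np.array([1.0, 1.0, 0.0], dtype=np.float32)       # 1.0 = attend, 0.0 = masked
additive_mask = (1.0 - attention_mask) * np.finfo(np.float32).min  # -> [0., 0., ~-3.4e38]
scores = np.array([2.0, 1.0, 3.0], dtype=np.float32) + additive_mask
weights = np.exp(scores - scores.max())
weights /= weights.sum()
# weights ~= [0.73, 0.27, 0.00]: after the softmax, the masked position contributes nothing.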
- extended_attention_mask = ( - 1.0 - extended_attention_mask) * finfo(dtype).min + extended_attention_mask = (1.0 - extended_attention_mask) * finfo(dtype).min return extended_attention_mask diff --git a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py b/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py index 81cca5941a71a..730f5b91dba6c 100644 --- a/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py +++ b/ppdiffusers/ppdiffusers/experimental/rl/value_guided_sampling.py @@ -40,11 +40,12 @@ class ValueGuidedRLPipeline(DiffusionPipeline): """ def __init__( - self, - value_function: UNet1DModel, - unet: UNet1DModel, - scheduler: DDPMScheduler, - env, ): + self, + value_function: UNet1DModel, + unet: UNet1DModel, + scheduler: DDPMScheduler, + env, + ): super().__init__() self.value_function = value_function self.unet = unet @@ -89,14 +90,13 @@ def run_diffusion(self, x, conditions, n_guide_steps, scale): y = None for i in self.progress_bar(self.scheduler.timesteps): # create batch of timesteps to pass into model - timesteps = paddle.full((batch_size, ), i, dtype=paddle.int64) + timesteps = paddle.full((batch_size,), i, dtype=paddle.int64) for _ in range(n_guide_steps): with paddle.set_grad_enabled(True): x.stop_gradient = False # permute to match dimension for pre-trained models - y = self.value_function(x.transpose([0, 2, 1]), - timesteps).sample + y = self.value_function(x.transpose([0, 2, 1]), timesteps).sample grad = paddle.autograd.grad([y.sum()], [x])[0] posterior_variance = self.scheduler._get_variance(i) @@ -108,24 +108,17 @@ def run_diffusion(self, x, conditions, n_guide_steps, scale): x = x + scale * grad x = self.reset_x0(x, conditions, self.action_dim) - prev_x = self.unet(x.transpose([0, 2, 1]), - timesteps).sample.transpose([0, 2, 1]) + prev_x = self.unet(x.transpose([0, 2, 1]), timesteps).sample.transpose([0, 2, 1]) # TODO: verify deprecation of this kwarg - x = self.scheduler.step( - prev_x, i, x, predict_epsilon=False)["prev_sample"] + x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"] # apply conditions to the trajectory (set the initial state) x = self.reset_x0(x, conditions, self.action_dim) x = self.to_paddle(x) return x, y - def __call__(self, - obs, - batch_size=64, - planning_horizon=32, - n_guide_steps=2, - scale=0.1): + def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1): # normalize the observations and create batch dimension obs = self.normalize(obs, "observations") obs = obs[None].repeat(batch_size, axis=0) @@ -144,7 +137,7 @@ def __call__(self, # sort output trajectories by value sorted_idx = paddle.argsort(y, 0, descending=True).squeeze() sorted_values = x[sorted_idx] - actions = sorted_values[:, :, :self.action_dim] + actions = sorted_values[:, :, : self.action_dim] actions = actions.detach().cpu().numpy() denorm_actions = self.de_normalize(actions, key="actions") diff --git a/ppdiffusers/ppdiffusers/image_processor.py b/ppdiffusers/ppdiffusers/image_processor.py index 3e52c14b439c4..82f9dd5f2c682 100644 --- a/ppdiffusers/ppdiffusers/image_processor.py +++ b/ppdiffusers/ppdiffusers/image_processor.py @@ -48,12 +48,13 @@ class VaeImageProcessor(ConfigMixin): @register_to_config def __init__( - self, - do_resize: bool=True, - vae_scale_factor: int=8, - resample: str="lanczos", - do_normalize: bool=True, - do_convert_rgb: bool=False, ): + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + resample: str = "lanczos", + do_normalize: bool = True, + 
do_convert_rgb: bool = False, + ): super().__init__() @staticmethod @@ -66,26 +67,20 @@ def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: images = (images * 255).round().astype("uint8") if images.shape[-1] == 1: # special case for grayscale (single channel) images - pil_images = [ - Image.fromarray( - image.squeeze(), mode="L") for image in images - ] + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] else: pil_images = [Image.fromarray(image) for image in images] return pil_images @staticmethod - def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image] - ) -> np.ndarray: + def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray: """ Convert a PIL image or a list of PIL images to numpy arrays. """ if not isinstance(images, list): images = [images] - images = [ - np.array(image).astype(np.float32) / 255.0 for image in images - ] + images = [np.array(image).astype(np.float32) / 255.0 for image in images] images = np.stack(images, axis=0) return images @@ -132,10 +127,11 @@ def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: return image def resize( - self, - image: PIL.Image.Image, - height: Optional[int]=None, - width: Optional[int]=None, ) -> PIL.Image.Image: + self, + image: PIL.Image.Image, + height: Optional[int] = None, + width: Optional[int] = None, + ) -> PIL.Image.Image: """ Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor` """ @@ -144,20 +140,18 @@ def resize( if width is None: width = image.width - width, height = (x - x % self.config.vae_scale_factor - for x in (width, height) - ) # resize to integer multiple of vae_scale_factor - image = image.resize( - (width, height), resample=PIL_INTERPOLATION[self.config.resample]) + width, height = ( + x - x % self.config.vae_scale_factor for x in (width, height) + ) # resize to integer multiple of vae_scale_factor + image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) return image def preprocess( - self, - image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray], - height: Optional[int]=None, - width: Optional[int]=None, - do_normalize: Optional[ - bool]=None, # new added, not exists in diffusers + self, + image: Union[paddle.Tensor, PIL.Image.Image, np.ndarray], + height: Optional[int] = None, + width: Optional[int] = None, + do_normalize: Optional[bool] = None, # new added, not exists in diffusers ) -> paddle.Tensor: """ Preprocess the image input, accepted formats are PIL images, numpy arrays or paddle tensors" @@ -165,8 +159,7 @@ def preprocess( supported_formats = (PIL.Image.Image, np.ndarray, paddle.Tensor) if isinstance(image, supported_formats): image = [image] - elif not (isinstance(image, list) and - all(isinstance(i, supported_formats) for i in image)): + elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): raise ValueError( f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(supported_formats)}" ) @@ -180,23 +173,19 @@ def preprocess( image = self.numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = self.numpy_to_pd(image) _, _, height, width = image.shape if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or - width % self.config.vae_scale_factor != 0): + height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + ): raise ValueError( f"Currently we only support resizing for PIL image - please resize your numpy array to be divisible by {self.config.vae_scale_factor}" f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) _, channel, height, width = image.shape # don't need any preprocess if the image is latents @@ -204,21 +193,21 @@ def preprocess( return image if self.config.do_resize and ( - height % self.config.vae_scale_factor != 0 or - width % self.config.vae_scale_factor != 0): + height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 + ): raise ValueError( f"Currently we only support resizing for PIL image - please resize your paddle tensor to be divisible by {self.config.vae_scale_factor}" f"currently the sizes are {height} and {width}. You can also pass a PIL image instead to use resize option in VAEImageProcessor" ) # expected range [0,1], normalize to [-1,1] - do_normalize = (self.config.do_normalize - if do_normalize is None else do_normalize) + do_normalize = self.config.do_normalize if do_normalize is None else do_normalize if image.min() < 0: warnings.warn( "Passing `image` as paddle tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " f"when passing as paddle tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", - FutureWarning, ) + FutureWarning, + ) do_normalize = False if do_normalize: @@ -227,10 +216,11 @@ def preprocess( return image def postprocess( - self, - image: paddle.Tensor, - output_type: str="pil", - do_denormalize: Optional[List[bool]]=None, ): + self, + image: paddle.Tensor, + output_type: str = "pil", + do_denormalize: Optional[List[bool]] = None, + ): if not isinstance(image, paddle.Tensor): raise ValueError( f"Input for postprocessing is in incorrect format: {type(image)}. We only support paddle tensor" @@ -238,12 +228,14 @@ def postprocess( if output_type not in ["latent", "pd", "np", "pil"]: deprecation_message = ( f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " - "`pil`, `np`, `pd`, `latent`") + "`pil`, `np`, `pd`, `latent`" + ) deprecate( "Unsupported output_type", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) output_type = "np" if output_type == "latent": @@ -252,10 +244,9 @@ def postprocess( if do_denormalize is None: do_denormalize = [self.config.do_normalize] * image.shape[0] - image = paddle.stack([ - self.denormalize(image[i]) if do_denormalize[i] else image[i] - for i in range(image.shape[0]) - ]) + image = paddle.stack( + [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])] + ) if output_type == "pd": return image diff --git a/ppdiffusers/ppdiffusers/loaders.py b/ppdiffusers/ppdiffusers/loaders.py index 934518d67b9d6..da64eb0e6ec9d 100644 --- a/ppdiffusers/ppdiffusers/loaders.py +++ b/ppdiffusers/ppdiffusers/loaders.py @@ -24,16 +24,31 @@ from huggingface_hub import hf_hub_download from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status -from .models.attention_processor import (CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, - LoRAAttnProcessor) +from .models.attention_processor import ( + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + LoRAAttnProcessor, +) from .models.modeling_utils import convert_state_dict -from .utils import (DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB, - HF_HUB_OFFLINE, PPDIFFUSERS_CACHE, TEXT_ENCODER_ATTN_MODULE, - TO_DIFFUSERS, _get_model_file, is_paddlenlp_available, - is_safetensors_available, is_torch_available, is_torch_file, - logging, ppdiffusers_url_download, safetensors_load, - smart_load, torch_load) +from .utils import ( + DIFFUSERS_CACHE, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + PPDIFFUSERS_CACHE, + TEXT_ENCODER_ATTN_MODULE, + TO_DIFFUSERS, + _get_model_file, + is_paddlenlp_available, + is_safetensors_available, + is_torch_available, + is_torch_file, + logging, + ppdiffusers_url_download, + safetensors_load, + smart_load, + torch_load, +) logger = logging.get_logger(__name__) @@ -68,11 +83,9 @@ def transpose_state_dict(state_dict, name_mapping=None): for old_name, new_name in name_mapping.items(): k = k.replace(old_name, new_name) if v.ndim == 2: - new_state_dict[k] = v.T.contiguous() if hasattr( - v, "contiguous") else v.T + new_state_dict[k] = v.T.contiguous() if hasattr(v, "contiguous") else v.T else: - new_state_dict[k] = v.contiguous() if hasattr(v, - "contiguous") else v + new_state_dict[k] = v.contiguous() if hasattr(v, "contiguous") else v return new_state_dict @@ -110,8 +123,7 @@ def map_from(module, state_dict, *args, **kwargs): all_keys = list(state_dict.keys()) for key in all_keys: replace_key = remap_key(key, state_dict) - new_key = key.replace( - replace_key, f"layers.{module.rev_mapping[replace_key]}") + new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}") state_dict[new_key] = state_dict[key] del state_dict[key] @@ -124,10 +136,10 @@ class UNet2DConditionLoadersMixin: unet_name = UNET_NAME def load_attn_procs( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[ - str, paddle.Tensor]], - **kwargs, ): + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], + **kwargs, + ): r""" Load pretrained attention processor layers into `UNet2DConditionModel`. 
Attention processor layers have to be defined in @@ -186,8 +198,9 @@ def load_attn_procs( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) @@ -202,8 +215,7 @@ def load_attn_procs( # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning network_alpha = kwargs.pop("network_alpha", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" ) @@ -221,13 +233,12 @@ def load_attn_procs( if from_diffusers: # Let's first try to load .safetensors weights if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path_or_dict, - weights_name=weight_name or - TORCH_LORA_WEIGHT_NAME_SAFE, + weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -237,7 +248,8 @@ def load_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) except Exception: model_file = None @@ -255,7 +267,8 @@ def load_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: model_file = _get_model_file( @@ -270,7 +283,8 @@ def load_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: state_dict = pretrained_model_name_or_path_or_dict @@ -279,53 +293,42 @@ def load_attn_procs( attn_processors = {} is_lora = all("lora" in k for k in state_dict.keys()) - is_custom_diffusion = any("custom_diffusion" in k - for k in state_dict.keys()) + is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys()) if from_diffusers or is_torch_file(model_file): state_dict = transpose_state_dict(state_dict) if is_lora: is_new_lora_format = all( - key.startswith(self.unet_name) or - key.startswith(self.text_encoder_name) - for key in state_dict.keys()) + key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() + ) if is_new_lora_format: # Strip the `"unet"` prefix. - is_text_encoder_present = any( - key.startswith(self.text_encoder_name) - for key in state_dict.keys()) + is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys()) if is_text_encoder_present: warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. 
To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)." warnings.warn(warn_message) - unet_keys = [ - k for k in state_dict.keys() if k.startswith(self.unet_name) - ] - state_dict = { - k.replace(f"{self.unet_name}.", ""): v - for k, v in state_dict.items() if k in unet_keys - } + unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)] + state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys} lora_grouped_dict = defaultdict(dict) for key, value in state_dict.items(): - attn_processor_key, sub_key = ".".join(key.split( - ".")[:-3]), ".".join(key.split(".")[-3:]) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) lora_grouped_dict[attn_processor_key][sub_key] = value.cast( - dtype="float32") # we must cast this to float32 + dtype="float32" + ) # we must cast this to float32 for key, value_dict in lora_grouped_dict.items(): - rank = value_dict["to_k_lora.down.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear - cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[ - 0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict["to_k_lora.up.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear + rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear + hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear attn_processors[key] = LoRAAttnProcessor( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank, - network_alpha=network_alpha, ) + network_alpha=network_alpha, + ) attn_processors[key].load_dict(value_dict) elif is_custom_diffusion: custom_diffusion_grouped_dict = defaultdict(dict) @@ -334,16 +337,12 @@ def load_attn_procs( custom_diffusion_grouped_dict[key] = {} else: if "to_out" in key: - attn_processor_key, sub_key = ".".join( - key.split(".")[:-3]), ".".join( - key.split(".")[-3:]) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) else: - attn_processor_key, sub_key = ".".join( - key.split(".")[:-2]), ".".join( - key.split(".")[-2:]) - custom_diffusion_grouped_dict[attn_processor_key][ - sub_key] = value.cast( - dtype="float32") # we must cast this to float32 + attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:]) + custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value.cast( + dtype="float32" + ) # we must cast this to float32 for key, value_dict in custom_diffusion_grouped_dict.items(): if len(value_dict) == 0: @@ -351,44 +350,42 @@ def load_attn_procs( train_kv=False, train_q_out=False, hidden_size=None, - cross_attention_dim=None, ) + cross_attention_dim=None, + ) else: - cross_attention_dim = value_dict[ - "to_k_custom_diffusion.weight"].shape[ - 0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict[ - "to_k_custom_diffusion.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear - train_q_out = (True if - "to_q_custom_diffusion.weight" in value_dict - else False) + cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[ + 0 + ] # 1 -> 0, torch vs paddle nn.Linear + hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[ + 1 + ] # 0 -> 1, torch vs paddle nn.Linear + train_q_out = True if 
"to_q_custom_diffusion.weight" in value_dict else False attn_processors[key] = CustomDiffusionAttnProcessor( train_kv=True, train_q_out=train_q_out, hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, ) + cross_attention_dim=cross_attention_dim, + ) attn_processors[key].load_dict(value_dict) else: raise ValueError( f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training." ) # set correct dtype & device - attn_processors = { - k: v.to(dtype=self.dtype) - for k, v in attn_processors.items() - } + attn_processors = {k: v.to(dtype=self.dtype) for k, v in attn_processors.items()} # set layers self.set_attn_processor(attn_processors) def save_attn_procs( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool=True, - weight_name: str=None, - save_function: Callable=None, - safe_serialization: bool=False, - to_diffusers: Optional[bool]=None, ): + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = False, + to_diffusers: Optional[bool] = None, + ): r""" Save an attention processor to a directory, so that it can be re-loaded using the `[`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`]` method. @@ -413,34 +410,33 @@ def save_attn_procs( """ if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." - ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) is_custom_diffusion = any( - isinstance(x, (CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor)) - for (_, x) in self.attn_processors.items()) + isinstance(x, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor)) + for (_, x) in self.attn_processors.items() + ) if is_custom_diffusion: - model_to_save = AttnProcsLayers({ - y: x - for (y, x) in self.attn_processors.items() - if isinstance( - x, - ( - CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor, ), ) - }) + model_to_save = AttnProcsLayers( + { + y: x + for (y, x) in self.attn_processors.items() + if isinstance( + x, + ( + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + ), + ) + } + ) state_dict = model_to_save.state_dict() for name, attn in self.attn_processors.items(): if len(attn.state_dict()) == 0: @@ -452,16 +448,13 @@ def save_attn_procs( if weight_name is None: if to_diffusers: if safe_serialization: - weight_name = (TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE - if is_custom_diffusion else - TORCH_LORA_WEIGHT_NAME_SAFE) + weight_name = ( + TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME_SAFE + ) else: - weight_name = (TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME - if is_custom_diffusion else - TORCH_LORA_WEIGHT_NAME) + weight_name = TORCH_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else TORCH_LORA_WEIGHT_NAME else: - weight_name = (PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME if - is_custom_diffusion else 
PADDLE_LORA_WEIGHT_NAME) + weight_name = PADDLE_CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else PADDLE_LORA_WEIGHT_NAME # choose save_function if save_function is None: @@ -469,16 +462,13 @@ def save_attn_procs( if safe_serialization: if is_torch_available(): _save_function = safetensors.torch.save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: _save_function = safetensors.numpy.save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") + state_dict = convert_state_dict(state_dict, framework="numpy") def save_function(weights, filename): - return _save_function( - weights, filename, metadata={"format": "pt"}) + return _save_function(weights, filename, metadata={"format": "pt"}) else: if not is_torch_available(): @@ -486,8 +476,7 @@ def save_function(weights, filename): "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." ) save_function = torch.save - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") state_dict = transpose_state_dict(state_dict) else: save_function = paddle.save @@ -495,9 +484,7 @@ def save_function(weights, filename): # Save the model save_function(state_dict, os.path.join(save_directory, weight_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weight_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") class TextualInversionLoaderMixin: @@ -505,9 +492,7 @@ class TextualInversionLoaderMixin: Mixin class for loading textual inversion tokens and embeddings to the tokenizer and text encoder. """ - def maybe_convert_prompt(self, - prompt: Union[str, List[str]], - tokenizer: "PretrainedTokenizer"): + def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PretrainedTokenizer"): r""" Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds to a multi-vector textual inversion embedding, this function will process the prompt so that the special token @@ -533,9 +518,7 @@ def maybe_convert_prompt(self, return prompts - def _maybe_convert_prompt(self, - prompt: str, - tokenizer: "PretrainedTokenizer"): + def _maybe_convert_prompt(self, prompt: str, tokenizer: "PretrainedTokenizer"): r""" Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds to a multi-vector textual inversion embedding, this function will process the prompt so that the special token @@ -563,10 +546,11 @@ def _maybe_convert_prompt(self, return prompt def load_textual_inversion( - self, - pretrained_model_name_or_path: Union[str, Dict[str, paddle.Tensor]], - token: Optional[str]=None, - **kwargs, ): + self, + pretrained_model_name_or_path: Union[str, Dict[str, paddle.Tensor]], + token: Optional[str] = None, + **kwargs, + ): r""" Load textual inversion embeddings into the text encoder of stable diffusion pipelines. Both `diffusers` and `Automatic1111` formats are supported (see example below). 
@@ -643,20 +627,21 @@ def load_textual_inversion( image.save("character.png") ``` """ - if not hasattr(self, "tokenizer") or not isinstance( - self.tokenizer, PretrainedTokenizer): + if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PretrainedTokenizer): raise ValueError( f"{self.__class__.__name__} requires `self.tokenizer` of type `PretrainedTokenizer` for calling" - f" `{self.load_textual_inversion.__name__}`") + f" `{self.load_textual_inversion.__name__}`" + ) - if not hasattr(self, "text_encoder") or not isinstance( - self.text_encoder, PretrainedModel): + if not hasattr(self, "text_encoder") or not isinstance(self.text_encoder, PretrainedModel): raise ValueError( f"{self.__class__.__name__} requires `self.text_encoder` of type `PretrainedModel` for calling" - f" `{self.load_textual_inversion.__name__}`") + f" `{self.load_textual_inversion.__name__}`" + ) from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -668,8 +653,7 @@ def load_textual_inversion( weight_name = kwargs.pop("weight_name", None) use_safetensors = kwargs.pop("use_safetensors", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" ) @@ -685,13 +669,12 @@ def load_textual_inversion( # Let's first try to load .safetensors weights if from_diffusers: if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=weight_name or - TORCH_TEXT_INVERSION_NAME_SAFE, + weights_name=weight_name or TORCH_TEXT_INVERSION_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -701,7 +684,8 @@ def load_textual_inversion( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = safetensors_load(model_file) except Exception: model_file = None @@ -719,7 +703,8 @@ def load_textual_inversion( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = torch_load(model_file) else: model_file = _get_model_file( @@ -734,7 +719,8 @@ def load_textual_inversion( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) if is_torch_file(model_file): try: state_dict = safetensors_load(model_file) @@ -759,9 +745,7 @@ def load_textual_inversion( embedding = state_dict["string_to_param"]["*"] if token is not None and loaded_token != token: - logger.warn( - f"The loaded token: {loaded_token} is overwritten by the passed token {token}." 
- ) + logger.warn(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.") else: token = loaded_token @@ -795,14 +779,11 @@ def load_textual_inversion( is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1 if is_multi_vector: - tokens = [token] + [ - f"{token}_{i}" for i in range(1, embedding.shape[0]) - ] + tokens = [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])] embeddings = [e for e in embedding] # noqa: C416 else: tokens = [token] - embeddings = [embedding[0]] if len( - embedding.shape) > 1 else [embedding] + embeddings = [embedding[0]] if len(embedding.shape) > 1 else [embedding] # add tokens and get ids self.tokenizer.add_tokens(tokens) @@ -812,8 +793,7 @@ def load_textual_inversion( self.text_encoder.resize_token_embeddings(len(self.tokenizer)) with paddle.no_grad(): for token_id, embedding in zip(token_ids, embeddings): - self.text_encoder.get_input_embeddings().weight[ - token_id] = embedding + self.text_encoder.get_input_embeddings().weight[token_id] = embedding logger.info(f"Loaded textual inversion embedding for {token}.") @@ -830,10 +810,10 @@ class LoraLoaderMixin: unet_name = UNET_NAME def load_lora_weights( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[ - str, paddle.Tensor]], - **kwargs, ): + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], + **kwargs, + ): r""" Load pretrained attention processor layers (such as LoRA) into [`UNet2DConditionModel`] and [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)). @@ -885,8 +865,9 @@ def load_lora_weights( # Load the main state dict first which has the LoRA layers for either of # UNet and text encoder or both. from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -901,8 +882,7 @@ def load_lora_weights( # set lora scale to a reasonable default self._lora_scale = 1.0 - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetenstors" ) @@ -920,13 +900,12 @@ def load_lora_weights( if from_diffusers: # Let's first try to load .safetensors weights if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path_or_dict, - weights_name=weight_name or - TORCH_LORA_WEIGHT_NAME_SAFE, + weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -936,7 +915,8 @@ def load_lora_weights( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) except Exception: model_file = None @@ -954,7 +934,8 @@ def load_lora_weights( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: model_file = _get_model_file( @@ -969,7 +950,8 @@ def load_lora_weights( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: state_dict = pretrained_model_name_or_path_or_dict @@ -979,45 +961,39 @@ def load_lora_weights( # Convert kohya-ss Style LoRA attn procs to ppdiffusers attn procs network_alpha = None - if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) - for k in state_dict.keys()): - state_dict, network_alpha = self._convert_kohya_lora_to_diffusers( - state_dict) + if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) for k in state_dict.keys()): + state_dict, network_alpha = self._convert_kohya_lora_to_diffusers(state_dict) from_diffusers = True # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as # their prefixes. keys = list(state_dict.keys()) - if all( - key.startswith(self.unet_name) or - key.startswith(self.text_encoder_name) for key in keys): + if all(key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in keys): # Load the layers corresponding to UNet. unet_keys = [k for k in keys if k.startswith(self.unet_name)] logger.info(f"Loading {self.unet_name}.") unet_lora_state_dict = { - k.replace(f"{self.unet_name}.", ""): v - for k, v in state_dict.items() if k in unet_keys + k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys } self.unet.load_attn_procs( unet_lora_state_dict, network_alpha=network_alpha, - from_diffusers=from_diffusers, ) + from_diffusers=from_diffusers, + ) # Load the layers corresponding to text encoder and make necessary adjustments. 
- text_encoder_keys = [ - k for k in keys if k.startswith(self.text_encoder_name) - ] + text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)] text_encoder_lora_state_dict = { - k.replace(f"{self.text_encoder_name}.", ""): v - for k, v in state_dict.items() if k in text_encoder_keys + k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys } if len(text_encoder_lora_state_dict) > 0: logger.info(f"Loading {self.text_encoder_name}.") attn_procs_text_encoder = self._load_text_encoder_attn_procs( text_encoder_lora_state_dict, network_alpha=network_alpha, - from_diffusers=from_diffusers, ) + from_diffusers=from_diffusers, + ) self._modify_text_encoder(attn_procs_text_encoder) # save lora attn procs of text encoder so that it can be easily retrieved @@ -1026,13 +1002,9 @@ def load_lora_weights( # Otherwise, we're dealing with the old format. This means the `state_dict` should only # contain the module names of the `unet` as its keys WITHOUT any prefix. elif not all( - key.startswith(self.unet_name) or - key.startswith(self.text_encoder_name) - for key in state_dict.keys()): - self.unet.load_attn_procs( - state_dict, - network_alpha=network_alpha, - from_diffusers=from_diffusers) + key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() + ): + self.unet.load_attn_procs(state_dict, network_alpha=network_alpha, from_diffusers=from_diffusers) warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`." warnings.warn(warn_message) @@ -1050,15 +1022,13 @@ def text_encoder_lora_attn_procs(self): def _remove_text_encoder_monkey_patch(self): # Loop over the nn.MultiHeadAttention module of text_encoder - for name, attn_module in self.text_encoder.named_sublayers( - include_self=True): + for name, attn_module in self.text_encoder.named_sublayers(include_self=True): if name.endswith(TEXT_ENCODER_ATTN_MODULE): # Loop over the LoRA layers for ( - _, - text_encoder_attr, - ) in self._lora_attn_processor_attr_to_text_encoder_attr.items( - ): + _, + text_encoder_attr, + ) in self._lora_attn_processor_attr_to_text_encoder_attr.items(): # Retrieve the q/k/v/out projection of nn.MultiHeadAttention module = attn_module.get_sublayer(text_encoder_attr) if hasattr(module, "old_forward"): @@ -1071,8 +1041,7 @@ def _remove_text_encoder_monkey_patch(self): # del processor delattr(attn_module, "processor") - def _modify_text_encoder(self, - attn_processors: Dict[str, LoRAAttnProcessor]): + def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): r""" Monkey-patches the forward passes of attention modules of the text encoder. 
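# Illustrative aside (not part of the diff above): a minimal, standalone sketch of the
# monkey-patching pattern this method applies (see the hunk that follows). `base_linear`
# and `lora_delta` are hypothetical stand-ins for an attention projection and its LoRA layer.
import paddle
import paddle.nn as nn

def make_new_forward(old_forward, lora_layer, lora_scale=1.0):
    # A factory is used so each patched module captures its own old_forward/lora_layer;
    # a bare closure defined inside the loop would late-bind to the last loop variables.
    def new_forward(x):
        return old_forward(x) + lora_scale * lora_layer(x)
    return new_forward

base_linear = nn.Linear(8, 8)
lora_delta = nn.Linear(8, 8, bias_attr=False)
base_linear.old_forward = base_linear.forward  # keep a handle so the patch can be undone later
base_linear.forward = make_new_forward(base_linear.old_forward, lora_delta)
y = base_linear(paddle.randn([2, 8]))          # base projection output plus the scaled LoRA delta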
@@ -1085,19 +1054,16 @@ def _modify_text_encoder(self, self._remove_text_encoder_monkey_patch() # Loop over the nn.MultiHeadAttention module of text_encoder - for name, attn_module in self.text_encoder.named_sublayers( - include_self=True): + for name, attn_module in self.text_encoder.named_sublayers(include_self=True): if name.endswith(TEXT_ENCODER_ATTN_MODULE): # Loop over the LoRA layers for ( - attn_proc_attr, - text_encoder_attr, - ) in self._lora_attn_processor_attr_to_text_encoder_attr.items( - ): + attn_proc_attr, + text_encoder_attr, + ) in self._lora_attn_processor_attr_to_text_encoder_attr.items(): # Retrieve the q/k/v/out projection of nn.MultiHeadAttention and its corresponding LoRA layer. module = attn_module.get_sublayer(text_encoder_attr) - lora_layer = attn_processors[name].get_sublayer( - attn_proc_attr) + lora_layer = attn_processors[name].get_sublayer(attn_proc_attr) # save old_forward to module that can be used to remove monkey-patch old_forward = module.old_forward = module.forward @@ -1105,8 +1071,7 @@ def _modify_text_encoder(self, # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 def make_new_forward(old_forward, lora_layer): def new_forward(x): - result = old_forward( - x) + self.lora_scale * lora_layer(x) + result = old_forward(x) + self.lora_scale * lora_layer(x) return result return new_forward @@ -1127,10 +1092,10 @@ def _lora_attn_processor_attr_to_text_encoder_attr(self): } def _load_text_encoder_attn_procs( - self, - pretrained_model_name_or_path_or_dict: Union[str, Dict[ - str, paddle.Tensor]], - **kwargs, ): + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], + **kwargs, + ): r""" Load pretrained attention processor layers for [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel). @@ -1184,8 +1149,9 @@ def _load_text_encoder_attn_procs( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -1198,8 +1164,7 @@ def _load_text_encoder_attn_procs( use_safetensors = kwargs.pop("use_safetensors", None) network_alpha = kwargs.pop("network_alpha", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetenstors" ) @@ -1215,13 +1180,12 @@ def _load_text_encoder_attn_procs( if from_diffusers: # Let's first try to load .safetensors weights if (use_safetensors and weight_name is None) or ( - weight_name is not None and - weight_name.endswith(".safetensors")): + weight_name is not None and weight_name.endswith(".safetensors") + ): try: model_file = _get_model_file( pretrained_model_name_or_path_or_dict, - weights_name=weight_name or - TORCH_LORA_WEIGHT_NAME_SAFE, + weights_name=weight_name or TORCH_LORA_WEIGHT_NAME_SAFE, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -1231,7 +1195,8 @@ def _load_text_encoder_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) except Exception: model_file = None @@ -1249,7 +1214,8 @@ def _load_text_encoder_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: model_file = _get_model_file( @@ -1264,7 +1230,8 @@ def _load_text_encoder_attn_procs( revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) state_dict = smart_load(model_file) else: state_dict = pretrained_model_name_or_path_or_dict @@ -1275,55 +1242,48 @@ def _load_text_encoder_attn_procs( is_lora = all("lora" in k for k in state_dict.keys()) if from_diffusers or is_torch_file(model_file): - state_dict = transpose_state_dict( - state_dict, name_mapping={".encoder.": ".transformer."}) + state_dict = transpose_state_dict(state_dict, name_mapping={".encoder.": ".transformer."}) if is_lora: lora_grouped_dict = defaultdict(dict) for key, value in state_dict.items(): - attn_processor_key, sub_key = ".".join(key.split( - ".")[:-3]), ".".join(key.split(".")[-3:]) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) lora_grouped_dict[attn_processor_key][sub_key] = value.cast( - dtype="float32") # we must cast this to float32 + dtype="float32" + ) # we must cast this to float32 for key, value_dict in lora_grouped_dict.items(): - rank = value_dict["to_k_lora.down.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear - cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[ - 0] # 1 -> 0, torch vs paddle nn.Linear - hidden_size = value_dict["to_k_lora.up.weight"].shape[ - 1] # 0 -> 1, torch vs paddle nn.Linear + rank = value_dict["to_k_lora.down.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear + cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0] # 1 -> 0, torch vs paddle nn.Linear + hidden_size = value_dict["to_k_lora.up.weight"].shape[1] # 0 -> 1, torch vs paddle nn.Linear attn_processors[key] = LoRAAttnProcessor( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank, - network_alpha=network_alpha, ) + network_alpha=network_alpha, + ) attn_processors[key].load_dict(value_dict) else: - raise ValueError( - f"{model_file} does not seem to be in the correct format expected by LoRA training." 
- ) + raise ValueError(f"{model_file} does not seem to be in the correct format expected by LoRA training.") # set correct dtype & device - attn_processors = { - k: v.to(dtype=self.text_encoder.dtype) - for k, v in attn_processors.items() - } + attn_processors = {k: v.to(dtype=self.text_encoder.dtype) for k, v in attn_processors.items()} return attn_processors @classmethod def save_lora_weights( - self, - save_directory: Union[str, os.PathLike], - unet_lora_layers: Dict[str, nn.Layer]=None, - text_encoder_lora_layers: Dict[str, nn.Layer]=None, - is_main_process: bool=True, - weight_name: str=None, - save_function: Callable=None, - safe_serialization: bool=False, - to_diffusers: Optional[bool]=None, ): + self, + save_directory: Union[str, os.PathLike], + unet_lora_layers: Dict[str, nn.Layer] = None, + text_encoder_lora_layers: Dict[str, nn.Layer] = None, + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = False, + to_diffusers: Optional[bool] = None, + ): r""" Save the LoRA parameters corresponding to the UNet and the text encoder. Arguments: @@ -1347,16 +1307,11 @@ def save_lora_weights( """ if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." - ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) @@ -1372,8 +1327,7 @@ def save_lora_weights( if text_encoder_lora_layers is not None: text_encoder_lora_state_dict = { f"{self.text_encoder_name}.{module_name}": param - for module_name, param in text_encoder_lora_layers.state_dict() - .items() + for module_name, param in text_encoder_lora_layers.state_dict().items() } state_dict.update(text_encoder_lora_state_dict) # TODO junnyu, rename paramaters. @@ -1394,16 +1348,13 @@ def save_lora_weights( if safe_serialization: if is_torch_available(): _save_function = safetensors.torch.save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: _save_function = safetensors.numpy.save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") + state_dict = convert_state_dict(state_dict, framework="numpy") def save_function(weights, filename): - return _save_function( - weights, filename, metadata={"format": "pt"}) + return _save_function(weights, filename, metadata={"format": "pt"}) else: if not is_torch_available(): @@ -1411,17 +1362,13 @@ def save_function(weights, filename): "`to_diffusers=True` with `safe_serialization=False` requires the `torch library: `pip install torch`." 
) save_function = torch.save - state_dict = convert_state_dict( - state_dict, framework="torch") - state_dict = transpose_state_dict( - state_dict, name_mapping={".transformer.": ".encoder."}) + state_dict = convert_state_dict(state_dict, framework="torch") + state_dict = transpose_state_dict(state_dict, name_mapping={".transformer.": ".encoder."}) else: save_function = paddle.save save_function(state_dict, os.path.join(save_directory, weight_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weight_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") def _convert_kohya_lora_to_diffusers(self, state_dict): unet_state_dict = {} @@ -1442,62 +1389,36 @@ def _convert_kohya_lora_to_diffusers(self, state_dict): raise ValueError("Network alpha is not consistent") if lora_name.startswith("lora_unet_"): - diffusers_name = key.replace("lora_unet_", "").replace("_", - ".") - diffusers_name = diffusers_name.replace("down.blocks", - "down_blocks") - diffusers_name = diffusers_name.replace("mid.block", - "mid_block") - diffusers_name = diffusers_name.replace("up.blocks", - "up_blocks") - diffusers_name = diffusers_name.replace( - "transformer.blocks", "transformer_blocks") - diffusers_name = diffusers_name.replace("to.q.lora", - "to_q_lora") - diffusers_name = diffusers_name.replace("to.k.lora", - "to_k_lora") - diffusers_name = diffusers_name.replace("to.v.lora", - "to_v_lora") - diffusers_name = diffusers_name.replace("to.out.0.lora", - "to_out_lora") + diffusers_name = key.replace("lora_unet_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("down.blocks", "down_blocks") + diffusers_name = diffusers_name.replace("mid.block", "mid_block") + diffusers_name = diffusers_name.replace("up.blocks", "up_blocks") + diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks") + diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora") if "transformer_blocks" in diffusers_name: if "attn1" in diffusers_name or "attn2" in diffusers_name: - diffusers_name = diffusers_name.replace( - "attn1", "attn1.processor") - diffusers_name = diffusers_name.replace( - "attn2", "attn2.processor") + diffusers_name = diffusers_name.replace("attn1", "attn1.processor") + diffusers_name = diffusers_name.replace("attn2", "attn2.processor") unet_state_dict[diffusers_name] = value - unet_state_dict[diffusers_name.replace( - ".down.", ".up.")] = state_dict[lora_name_up] + unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] elif lora_name.startswith("lora_te_"): - diffusers_name = key.replace("lora_te_", "").replace("_", - ".") - diffusers_name = diffusers_name.replace("text.model", - "text_model") - diffusers_name = diffusers_name.replace("self.attn", - "self_attn") - diffusers_name = diffusers_name.replace("q.proj.lora", - "to_q_lora") - diffusers_name = diffusers_name.replace("k.proj.lora", - "to_k_lora") - diffusers_name = diffusers_name.replace("v.proj.lora", - "to_v_lora") - diffusers_name = diffusers_name.replace("out.proj.lora", - "to_out_lora") + diffusers_name = key.replace("lora_te_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("text.model", "text_model") + diffusers_name = diffusers_name.replace("self.attn", "self_attn") + diffusers_name = 
diffusers_name.replace("q.proj.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora") if "self_attn" in diffusers_name: te_state_dict[diffusers_name] = value - te_state_dict[diffusers_name.replace( - ".down.", ".up.")] = state_dict[lora_name_up] + te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] - unet_state_dict = { - f"{UNET_NAME}.{module_name}": params - for module_name, params in unet_state_dict.items() - } - te_state_dict = { - f"{TEXT_ENCODER_NAME}.{module_name}": params - for module_name, params in te_state_dict.items() - } - new_state_dict = { ** unet_state_dict, ** te_state_dict} + unet_state_dict = {f"{UNET_NAME}.{module_name}": params for module_name, params in unet_state_dict.items()} + te_state_dict = {f"{TEXT_ENCODER_NAME}.{module_name}": params for module_name, params in te_state_dict.items()} + new_state_dict = {**unet_state_dict, **te_state_dict} return new_state_dict, network_alpha @@ -1582,12 +1503,14 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): ``` """ # import here to avoid circular dependency - from .pipelines.stable_diffusion.convert_from_ckpt import \ - download_from_original_stable_diffusion_ckpt + from .pipelines.stable_diffusion.convert_from_ckpt import ( + download_from_original_stable_diffusion_ckpt, + ) from_hf_hub = "huggingface.co" in pretrained_model_link_or_path or "hf.co" - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) proxies = kwargs.pop("proxies", None) @@ -1631,22 +1554,20 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): pretrained_model_link_or_path = str(pretrained_model_link_or_path) if os.path.isfile(pretrained_model_link_or_path): checkpoint_path = pretrained_model_link_or_path - elif pretrained_model_link_or_path.startswith( - "http://") or pretrained_model_link_or_path.startswith( - "https://"): + elif pretrained_model_link_or_path.startswith("http://") or pretrained_model_link_or_path.startswith( + "https://" + ): # HF Hub models - if any(p in pretrained_model_link_or_path - for p in ["huggingface.co", "hf.co"]): + if any(p in pretrained_model_link_or_path for p in ["huggingface.co", "hf.co"]): # remove huggingface url for prefix in [ - "https://huggingface.co/", - "huggingface.co/", - "hf.co/", - "https://hf.co/", + "https://huggingface.co/", + "huggingface.co/", + "hf.co/", + "https://hf.co/", ]: if pretrained_model_link_or_path.startswith(prefix): - pretrained_model_link_or_path = pretrained_model_link_or_path[ - len(prefix):] + pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :] # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained ckpt_path = Path(pretrained_model_link_or_path) @@ -1656,10 +1577,10 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): file_path = str(Path().joinpath(*ckpt_path.parts[2:])) if file_path.startswith("blob/"): - file_path = file_path[len("blob/"):] + file_path = file_path[len("blob/") :] if file_path.startswith("main/"): - file_path = file_path[len("main/"):] + file_path = file_path[len("main/") :] 
checkpoint_path = hf_hub_download( repo_id, @@ -1670,17 +1591,18 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): local_files_only=local_files_only, use_auth_token=use_auth_token, revision=revision, - force_download=force_download, ) + force_download=force_download, + ) else: checkpoint_path = ckpt_path else: checkpoint_path = ppdiffusers_url_download( pretrained_model_link_or_path, cache_dir=cache_dir, - filename=http_file_name(pretrained_model_link_or_path) - .strip('"'), + filename=http_file_name(pretrained_model_link_or_path).strip('"'), force_download=force_download, - resume_download=resume_download, ) + resume_download=resume_download, + ) else: checkpoint_path = pretrained_model_link_or_path @@ -1697,18 +1619,20 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): upcast_attention=upcast_attention, load_safety_checker=load_safety_checker, prediction_type=prediction_type, - paddle_dtype=paddle_dtype, ) + paddle_dtype=paddle_dtype, + ) return pipe def http_file_name( - url: str, - *, - proxies=None, - headers: Optional[Dict[str, str]]=None, - timeout=10.0, - max_retries=0, ): + url: str, + *, + proxies=None, + headers: Optional[Dict[str, str]] = None, + timeout=10.0, + max_retries=0, +): """ Get a remote file name. """ @@ -1720,7 +1644,8 @@ def http_file_name( proxies=proxies, headers=headers, timeout=timeout, - max_retries=max_retries, ) + max_retries=max_retries, + ) hf_raise_for_status(r) displayed_name = url.split("/")[-1] content_disposition = r.headers.get("Content-Disposition") diff --git a/ppdiffusers/ppdiffusers/models/__init__.py b/ppdiffusers/ppdiffusers/models/__init__.py index 3269f70a0217e..19d5a1b254b83 100644 --- a/ppdiffusers/ppdiffusers/models/__init__.py +++ b/ppdiffusers/ppdiffusers/models/__init__.py @@ -14,8 +14,11 @@ # limitations under the License. 
# flake8: noqa -from ..utils.import_utils import (OptionalDependencyNotAvailable, - is_einops_available, is_paddle_available) +from ..utils.import_utils import ( + OptionalDependencyNotAvailable, + is_einops_available, + is_paddle_available, +) if is_paddle_available(): from .adapter import MultiAdapter, T2IAdapter diff --git a/ppdiffusers/ppdiffusers/models/adapter.py b/ppdiffusers/ppdiffusers/models/adapter.py index f51292032a59c..639118f29b348 100644 --- a/ppdiffusers/ppdiffusers/models/adapter.py +++ b/ppdiffusers/ppdiffusers/models/adapter.py @@ -22,15 +22,7 @@ class BottleneckResnetBlock(paddle.nn.Layer): - def __init__(self, - in_c, - mid_c, - out_c, - down, - ksize=3, - sk=False, - use_conv=True, - proj_ksize=1): + def __init__(self, in_c, mid_c, out_c, down, ksize=3, sk=False, use_conv=True, proj_ksize=1): super().__init__() ps = ksize // 2 proj_pad = proj_ksize // 2 @@ -40,7 +32,8 @@ def __init__(self, out_channels=mid_c, kernel_size=proj_ksize, stride=1, - padding=proj_pad, ) + padding=proj_pad, + ) else: self.conv1 = None if out_c != mid_c: @@ -49,29 +42,27 @@ def __init__(self, out_channels=out_c, kernel_size=proj_ksize, stride=1, - padding=proj_pad, ) + padding=proj_pad, + ) else: self.conv2 = None - self.block1 = paddle.nn.Conv2D( - in_channels=mid_c, - out_channels=mid_c, - kernel_size=3, - stride=1, - padding=1) + self.block1 = paddle.nn.Conv2D(in_channels=mid_c, out_channels=mid_c, kernel_size=3, stride=1, padding=1) self.act = paddle.nn.ReLU() self.block2 = paddle.nn.Conv2D( in_channels=mid_c, out_channels=mid_c, kernel_size=ksize, stride=1, - padding=ps, ) + padding=ps, + ) if sk is False: self.conv_shortcut = paddle.nn.Conv2D( in_channels=in_c, out_channels=mid_c, kernel_size=ksize, stride=1, - padding=ps, ) + padding=ps, + ) else: self.conv_shortcut = None self.down = down @@ -136,20 +127,20 @@ class T2IAdapter(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - block_out_channels: List[int]=[320, 640, 1280, 1280], - block_mid_channels: Optional[List[int]]=None, - num_res_blocks: int=3, - channels_in: int=3, - kernel_size: int=3, - proj_kernel_size: int=1, - res_block_skip: bool=True, - use_conv: bool=False, - input_scale_factor: int=8, ): + self, + block_out_channels: List[int] = [320, 640, 1280, 1280], + block_mid_channels: Optional[List[int]] = None, + num_res_blocks: int = 3, + channels_in: int = 3, + kernel_size: int = 3, + proj_kernel_size: int = 1, + res_block_skip: bool = True, + use_conv: bool = False, + input_scale_factor: int = 8, + ): super(T2IAdapter, self).__init__() self.num_downsample_blocks = len(block_out_channels) - self.unshuffle = paddle.nn.PixelUnshuffle( - downscale_factor=input_scale_factor) + self.unshuffle = paddle.nn.PixelUnshuffle(downscale_factor=input_scale_factor) self.num_res_blocks = num_res_blocks self.body = [] if block_mid_channels is None: @@ -166,7 +157,9 @@ def __init__( ksize=kernel_size, proj_ksize=proj_kernel_size, sk=res_block_skip, - use_conv=use_conv, )) + use_conv=use_conv, + ) + ) elif j == num_res_blocks - 1: self.body.append( BottleneckResnetBlock( @@ -177,7 +170,9 @@ def __init__( ksize=kernel_size, proj_ksize=proj_kernel_size, sk=res_block_skip, - use_conv=use_conv, )) + use_conv=use_conv, + ) + ) else: self.body.append( BottleneckResnetBlock( @@ -188,7 +183,9 @@ def __init__( ksize=kernel_size, proj_ksize=proj_kernel_size, sk=res_block_skip, - use_conv=use_conv, )) + use_conv=use_conv, + ) + ) self.body = paddle.nn.LayerList(sublayers=self.body) if block_mid_channels[0] == block_out_channels[0]: 
self.conv_in = paddle.nn.Conv2D( @@ -196,14 +193,16 @@ def __init__( out_channels=block_mid_channels[0], kernel_size=3, stride=1, - padding=1, ) + padding=1, + ) else: self.conv_in = paddle.nn.Conv2D( in_channels=channels_in * input_scale_factor**2, out_channels=block_mid_channels[0], kernel_size=proj_kernel_size, stride=1, - padding=proj_kernel_size // 2, ) + padding=proj_kernel_size // 2, + ) def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]: """ @@ -241,9 +240,7 @@ def __init__(self, adapters: List[T2IAdapter]): self.num_adapter = len(adapters) self.adapters = paddle.nn.LayerList(sublayers=adapters) - def forward( - self, xs: paddle.Tensor, - adapter_weights: Optional[List[float]]=None) -> List[paddle.Tensor]: + def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = None) -> List[paddle.Tensor]: """ Args: xs (`torch.Tensor`): @@ -254,8 +251,7 @@ def forward( them together. """ if adapter_weights is None: - adapter_weights = paddle.to_tensor([1 / self.num_adapter] * - self.num_adapter) + adapter_weights = paddle.to_tensor([1 / self.num_adapter] * self.num_adapter) else: adapter_weights = paddle.to_tensor(adapter_weights) if xs.shape[1] % self.num_adapter != 0: diff --git a/ppdiffusers/ppdiffusers/models/attention.py b/ppdiffusers/ppdiffusers/models/attention.py index 47ae9ef9aa303..199e115810a3e 100644 --- a/ppdiffusers/ppdiffusers/models/attention.py +++ b/ppdiffusers/ppdiffusers/models/attention.py @@ -24,7 +24,7 @@ from .embeddings import CombinedTimestepLabelEmbeddings -def drop_path(input, drop_prob: float=0.0, training: bool=False): +def drop_path(input, drop_prob: float = 0.0, training: bool = False): """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -37,8 +37,7 @@ def drop_path(input, drop_prob: float=0.0, training: bool=False): if drop_prob == 0.0 or not training: return input keep_prob = 1 - drop_prob - shape = (input.shape[0], ) + (1, ) * ( - input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets random_tensor = keep_prob + paddle.rand(shape, dtype=input.dtype) random_tensor = paddle.floor(random_tensor) # binarize output = (input / keep_prob) * random_tensor @@ -48,7 +47,7 @@ def drop_path(input, drop_prob: float=0.0, training: bool=False): class DropPath(nn.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" - def __init__(self, drop_prob: Optional[float]=None) -> None: + def __init__(self, drop_prob: Optional[float] = None) -> None: super().__init__() self.drop_prob = drop_prob @@ -61,12 +60,13 @@ def extra_repr(self) -> str: class Mlp(nn.Layer): def __init__( - self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.0, ): + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -103,22 +103,21 @@ class AttentionBlock(nn.Layer): # IMPORTANT;TODO(Patrick, William) - this class will be deprecated soon. 
Do not use it anymore def __init__( - self, - channels: int, - num_head_channels: Optional[int]=None, - norm_num_groups: int=32, - rescale_output_factor: float=1.0, - eps: float=1e-5, ): + self, + channels: int, + num_head_channels: Optional[int] = None, + norm_num_groups: int = 32, + rescale_output_factor: float = 1.0, + eps: float = 1e-5, + ): super().__init__() self.channels = channels - self.num_heads = (channels // num_head_channels - if num_head_channels is not None else 1) + self.num_heads = channels // num_head_channels if num_head_channels is not None else 1 self.head_size = self.channels // self.num_heads self.scale = 1 / math.sqrt(self.channels / self.num_heads) - self.group_norm = nn.GroupNorm( - num_channels=channels, num_groups=norm_num_groups, epsilon=eps) + self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, epsilon=eps) # define q,k,v as linear layers self.query = nn.Linear(channels, channels) @@ -132,10 +131,7 @@ def __init__( self._use_2_5_attn = True self._attention_op = None - def reshape_heads_to_batch_dim(self, - tensor, - transpose=True, - merge_head_and_batch=False): + def reshape_heads_to_batch_dim(self, tensor, transpose=True, merge_head_and_batch=False): tensor = tensor.reshape([0, 0, self.num_heads, self.head_size]) # currently we donot use `unmerge_head_and_batch` if transpose or merge_head_and_batch: @@ -145,15 +141,11 @@ def reshape_heads_to_batch_dim(self, tensor = tensor.flatten(0, 1) return tensor - def reshape_batch_dim_to_heads(self, - tensor, - transpose=True, - unmerge_head_and_batch=False): + def reshape_batch_dim_to_heads(self, tensor, transpose=True, unmerge_head_and_batch=False): # currently we donot use `unmerge_head_and_batch` if unmerge_head_and_batch: seq_len = tensor.shape[1] - tensor = tensor.reshape( - [-1, self.num_heads, seq_len, self.head_size]) + tensor = tensor.reshape([-1, self.num_heads, seq_len, self.head_size]) if transpose or unmerge_head_and_batch: tensor = tensor.transpose([0, 2, 1, 3]) @@ -162,9 +154,10 @@ def reshape_batch_dim_to_heads(self, return tensor def set_use_memory_efficient_attention_xformers( - self, - use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 # if self.head_size > 128 and attention_op == "flash": # attention_op = "cutlass" @@ -176,18 +169,15 @@ def set_use_memory_efficient_attention_xformers( else: try: _ = F.scaled_dot_product_attention_( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e - self._use_memory_efficient_attention_xformers = ( - use_memory_efficient_attention_xformers) + self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers self._attention_op = attention_op def forward(self, hidden_states): @@ -197,8 +187,7 @@ def forward(self, hidden_states): # norm hidden_states = self.group_norm(hidden_states) - hidden_states = hidden_states.reshape( - [batch, channel, height * width]).transpose([0, 2, 1]) + hidden_states = hidden_states.reshape([batch, channel, height * 
width]).transpose([0, 2, 1]) # proj to q, k, v query_proj = self.query(hidden_states) @@ -206,14 +195,14 @@ def forward(self, hidden_states): value_proj = self.value(hidden_states) query_proj = self.reshape_heads_to_batch_dim( - query_proj, - transpose=not self._use_memory_efficient_attention_xformers) + query_proj, transpose=not self._use_memory_efficient_attention_xformers + ) key_proj = self.reshape_heads_to_batch_dim( - key_proj, - transpose=not self._use_memory_efficient_attention_xformers) + key_proj, transpose=not self._use_memory_efficient_attention_xformers + ) value_proj = self.reshape_heads_to_batch_dim( - value_proj, - transpose=not self._use_memory_efficient_attention_xformers) + value_proj, transpose=not self._use_memory_efficient_attention_xformers + ) if self._use_memory_efficient_attention_xformers: hidden_states = F.scaled_dot_product_attention_( @@ -224,25 +213,22 @@ def forward(self, hidden_states): scale=self.scale, dropout_p=0.0, training=self.training, - attention_op=self._attention_op, ) + attention_op=self._attention_op, + ) else: - attention_scores = (paddle.matmul( - query_proj, key_proj, transpose_y=True) * self.scale) - attention_probs = F.softmax( - attention_scores.cast("float32"), - axis=-1).cast(attention_scores.dtype) + attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale + attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype) hidden_states = paddle.matmul(attention_probs, value_proj) # reshape hidden_states hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, - transpose=not self._use_memory_efficient_attention_xformers) + hidden_states, transpose=not self._use_memory_efficient_attention_xformers + ) # compute next hidden_states hidden_states = self.proj_attn(hidden_states) - hidden_states = hidden_states.transpose([0, 2, 1]).reshape( - [batch, channel, height, width]) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch, channel, height, width]) # res connect and rescale hidden_states = (hidden_states + residual) / self.rescale_output_factor @@ -271,31 +257,29 @@ class BasicTransformerBlock(nn.Layer): """ def __init__( - self, - dim: int, - num_attention_heads: int, - attention_head_dim: int, - dropout=0.0, - cross_attention_dim: Optional[int]=None, - activation_fn: str="geglu", - num_embeds_ada_norm: Optional[int]=None, - attention_bias: bool=False, - only_cross_attention: bool=False, - double_self_attention: bool=False, - upcast_attention: bool=False, - norm_elementwise_affine: bool=True, - norm_type: str="layer_norm", - final_dropout: bool=False, ): + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", + final_dropout: bool = False, + ): super().__init__() self.only_cross_attention = only_cross_attention - self.use_ada_layer_norm_zero = ( - num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" - self.use_ada_layer_norm = ( - num_embeds_ada_norm is not None) and norm_type == "ada_norm" + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == 
"ada_norm" - if norm_type in ("ada_norm", "ada_norm_zero" - ) and num_embeds_ada_norm is None: + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: raise ValueError( f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." @@ -320,22 +304,21 @@ def __init__( dim_head=attention_head_dim, dropout=dropout, bias=attention_bias, - cross_attention_dim=cross_attention_dim - if only_cross_attention else None, - upcast_attention=upcast_attention, ) + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) # 2. Cross-Attn if cross_attention_dim is not None or double_self_attention: # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during # the second cross attention block. - self.norm2 = (AdaLayerNorm(dim, num_embeds_ada_norm) - if self.use_ada_layer_norm else - nn.LayerNorm(dim, **norm_kwargs)) + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim, **norm_kwargs) + ) self.attn2 = Attention( query_dim=dim, - cross_attention_dim=cross_attention_dim - if not double_self_attention else None, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, heads=num_attention_heads, dim_head=attention_head_dim, dropout=dropout, @@ -352,46 +335,45 @@ def __init__( dim, dropout=dropout, activation_fn=activation_fn, - final_dropout=final_dropout, ) + final_dropout=final_dropout, + ) def forward( - self, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - timestep=None, - cross_attention_kwargs=None, - class_labels=None, ): + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ): # Notice that normalization is always applied before the real computation in the following blocks. # 1. 
Self-Attention if self.use_ada_layer_norm: norm_hidden_states = self.norm1(hidden_states, timestep) elif self.use_ada_layer_norm_zero: norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( - hidden_states, - timestep, - class_labels, - hidden_dtype=hidden_states.dtype) + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) else: norm_hidden_states = self.norm1(hidden_states) - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} attn_output = self.attn1( norm_hidden_states, - encoder_hidden_states=encoder_hidden_states - if self.only_cross_attention else None, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output hidden_states = attn_output + hidden_states if self.attn2 is not None: - norm_hidden_states = (self.norm2(hidden_states, timestep) - if self.use_ada_layer_norm else - self.norm2(hidden_states)) + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly # prepare attention mask here @@ -400,15 +382,15 @@ def forward( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=encoder_attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) hidden_states = attn_output + hidden_states # 3. Feed-forward norm_hidden_states = self.norm3(hidden_states) if self.use_ada_layer_norm_zero: - norm_hidden_states = (norm_hidden_states * - (1 + scale_mlp[:, None]) + shift_mlp[:, None]) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] ff_output = self.ff(norm_hidden_states) @@ -434,13 +416,14 @@ class FeedForward(nn.Layer): """ def __init__( - self, - dim: int, - dim_out: Optional[int]=None, - mult: int=4, - dropout: float=0.0, - activation_fn: str="geglu", - final_dropout: bool=False, ): + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + ): super().__init__() inner_dim = int(dim * mult) dim_out = dim_out if dim_out is not None else dim @@ -476,7 +459,7 @@ class GELU(nn.Layer): GELU activation function with tanh approximation support with `approximate="tanh"`. 
""" - def __init__(self, dim_in: int, dim_out: int, approximate: str="none"): + def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"): super().__init__() self.proj = nn.Linear(dim_in, dim_out) self.approximate = approximate @@ -552,22 +535,17 @@ class AdaLayerNormZero(nn.Layer): def __init__(self, embedding_dim, num_embeddings): super().__init__() - self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, - embedding_dim) + self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim) self.silu = nn.Silu() - self.linear = nn.Linear( - embedding_dim, 6 * embedding_dim, bias_attr=True) + self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True) # elementwise_affine=False norm_kwargs = {"weight_attr": False, "bias_attr": False} self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs) def forward(self, x, timestep, class_labels, hidden_dtype=None): - emb = self.linear( - self.silu( - self.emb(timestep, class_labels, hidden_dtype=hidden_dtype))) - shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk( - 6, axis=1) + emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype))) + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1) x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] return x, gate_msa, shift_mlp, scale_mlp, gate_mlp @@ -578,12 +556,13 @@ class AdaGroupNorm(nn.Layer): """ def __init__( - self, - embedding_dim: int, - out_dim: int, - num_groups: int, - act_fn: Optional[str]=None, - eps: float=1e-5, ): + self, + embedding_dim: int, + out_dim: int, + num_groups: int, + act_fn: Optional[str] = None, + eps: float = 1e-5, + ): super().__init__() self.num_groups = num_groups self.eps = eps @@ -600,8 +579,7 @@ def __init__( self.linear = nn.Linear(embedding_dim, out_dim * 2) # elementwise_affine=False norm_kwargs = {"weight_attr": False, "bias_attr": False} - self.group_norm = nn.GroupNorm( - num_groups, out_dim, epsilon=eps, **norm_kwargs) + self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs) self.group_norm.weight = None self.group_norm.bias = None diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py index 506c08b6c76b0..e2c4770f3398a 100644 --- a/ppdiffusers/ppdiffusers/models/attention_processor.py +++ b/ppdiffusers/ppdiffusers/models/attention_processor.py @@ -40,27 +40,27 @@ class Attention(nn.Layer): """ def __init__( - self, - query_dim: int, - cross_attention_dim: Optional[int]=None, - heads: int=8, - dim_head: int=64, - dropout: float=0.0, - bias=False, - upcast_attention: bool=False, - upcast_softmax: bool=False, - cross_attention_norm: Optional[str]=None, - cross_attention_norm_num_groups: int=32, - added_kv_proj_dim: Optional[int]=None, - norm_num_groups: Optional[int]=None, - out_bias: bool=True, - scale_qk: bool=True, - only_cross_attention: bool=False, - processor: Optional["AttnProcessor"]=None, ): + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias=False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + added_kv_proj_dim: Optional[int] = None, + norm_num_groups: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + processor: Optional["AttnProcessor"] = None, + ): 
super().__init__() inner_dim = dim_head * heads - cross_attention_dim = (cross_attention_dim if - cross_attention_dim is not None else query_dim) + cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim self.upcast_attention = upcast_attention self.upcast_softmax = upcast_softmax @@ -82,10 +82,7 @@ def __init__( ) if norm_num_groups is not None: - self.group_norm = nn.GroupNorm( - num_channels=query_dim, - num_groups=norm_num_groups, - epsilon=1e-5) + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, epsilon=1e-5) else: self.group_norm = None @@ -107,7 +104,8 @@ def __init__( self.norm_cross = nn.GroupNorm( num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, - epsilon=1e-5, ) + epsilon=1e-5, + ) else: raise ValueError( f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" @@ -117,10 +115,8 @@ def __init__( if not self.only_cross_attention: # only relevant for the `AddedKVProcessor` classes - self.to_k = nn.Linear( - cross_attention_dim, inner_dim, bias_attr=bias) - self.to_v = nn.Linear( - cross_attention_dim, inner_dim, bias_attr=bias) + self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias) + self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias) else: self.to_k = None self.to_v = None @@ -140,15 +136,17 @@ def __init__( self.set_processor(processor) def set_use_memory_efficient_attention_xformers( - self, - use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): is_lora = hasattr(self, "processor") and isinstance( - self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor)) + self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor) + ) is_custom_diffusion = hasattr(self, "processor") and isinstance( self.processor, - (CustomDiffusionAttnProcessor, - CustomDiffusionXFormersAttnProcessor), ) + (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor), + ) is_added_kv = self.added_kv_proj_dim is not None if use_memory_efficient_attention_xformers: # if self.added_kv_proj_dim is not None: @@ -167,13 +165,11 @@ def set_use_memory_efficient_attention_xformers( try: # Make sure we can run the memory efficient attention _ = F.scaled_dot_product_attention_( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 @@ -184,7 +180,8 @@ def set_use_memory_efficient_attention_xformers( hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, rank=self.processor.rank, - attention_op=attention_op, ) + attention_op=attention_op, + ) # we must cast dtype processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) @@ -194,13 +191,13 @@ def set_use_memory_efficient_attention_xformers( train_q_out=self.processor.train_q_out, hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, - attention_op=attention_op, ) + attention_op=attention_op, + ) # we must cast dtype 
processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) elif is_added_kv: - processor = XFormersAttnAddedKVProcessor( - attention_op=attention_op) + processor = XFormersAttnAddedKVProcessor(attention_op=attention_op) else: processor = XFormersAttnProcessor(attention_op=attention_op) else: @@ -208,7 +205,8 @@ def set_use_memory_efficient_attention_xformers( processor = LoRAAttnProcessor( hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, - rank=self.processor.rank, ) + rank=self.processor.rank, + ) # we must cast dtype processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) @@ -217,7 +215,8 @@ def set_use_memory_efficient_attention_xformers( train_kv=self.processor.train_kv, train_q_out=self.processor.train_q_out, hidden_size=self.processor.hidden_size, - cross_attention_dim=self.processor.cross_attention_dim, ) + cross_attention_dim=self.processor.cross_attention_dim, + ) # we must cast dtype processor.to(dtype=self.dtype) processor.load_dict(self.processor.state_dict()) @@ -230,9 +229,7 @@ def set_use_memory_efficient_attention_xformers( def set_attention_slice(self, slice_size): if slice_size is not None and slice_size > self.sliceable_head_dim: - raise ValueError( - f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}." - ) + raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.") if slice_size is not None and self.added_kv_proj_dim is not None: processor = SlicedAttnAddedKVProcessor(slice_size) @@ -248,22 +245,19 @@ def set_attention_slice(self, slice_size): def set_processor(self, processor: "AttnProcessor"): # if current processor is in `self._sub_layers` and if passed `processor` is not, we need to # pop `processor` from `self._sub_layers` - if (hasattr(self, "processor") and - isinstance(self.processor, nn.Layer) and - not isinstance(processor, nn.Layer)): - logger.info( - f"You are removing possibly trained weights of {self.processor} with {processor}" - ) + if hasattr(self, "processor") and isinstance(self.processor, nn.Layer) and not isinstance(processor, nn.Layer): + logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") self._sub_layers.pop("processor") self.processor = processor def forward( - self, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): # The `Attention` class can call different attention processors / attention functions # here we simply pass along all tensors to the selected processor class # For standard processors that are defined here, `**cross_attention_kwargs` is empty @@ -272,14 +266,14 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) def batch_to_head_dim(self, tensor, transpose=True, in_dim=4): if in_dim == 3: head_size = self.heads batch_size, seq_len, dim = tensor.shape - tensor = tensor.reshape( - [batch_size // head_size, head_size, seq_len, dim]) + tensor = tensor.reshape([batch_size // head_size, head_size, seq_len, dim]) if transpose: tensor = tensor.transpose([0, 2, 1, 3]) tensor = tensor.reshape([0, 0, tensor.shape[2] * tensor.shape[3]]) @@ -301,8 +295,7 @@ def get_attention_scores(self, query, key, attention_mask=None): query = query.cast(paddle.float32) key = 
key.cast(paddle.float32) - attention_scores = paddle.matmul( - query, key, transpose_y=True) * self.scale + attention_scores = paddle.matmul(query, key, transpose_y=True) * self.scale if attention_mask is not None: attention_scores = attention_scores + attention_mask @@ -317,12 +310,7 @@ def get_attention_scores(self, query, key, attention_mask=None): return attention_probs - def prepare_attention_mask(self, - attention_mask, - target_length, - batch_size=None, - out_dim=4, - transpose=True): + def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=4, transpose=True): if batch_size is None: deprecate( "batch_size=None", @@ -331,7 +319,8 @@ def prepare_attention_mask(self, "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect" " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to" " `prepare_attention_mask` when preparing the attention_mask." - ), ) + ), + ) batch_size = 1 num_heads = self.heads @@ -339,21 +328,15 @@ def prepare_attention_mask(self, return attention_mask if attention_mask.shape[-1] != target_length: - attention_mask = F.pad(attention_mask, (0, target_length), - value=0.0, - data_format="NCL") + attention_mask = F.pad(attention_mask, (0, target_length), value=0.0, data_format="NCL") if out_dim == 3: if attention_mask.shape[0] < batch_size * num_heads: - attention_mask = attention_mask.repeat_interleave( - num_heads, axis=0) + attention_mask = attention_mask.repeat_interleave(num_heads, axis=0) elif out_dim == 4: attention_mask = attention_mask.unsqueeze(1) if attention_mask.shape[0] < batch_size * num_heads: - attention_mask = attention_mask.repeat_interleave( - num_heads, axis=1) - attention_mask = paddle.reshape( - attention_mask, - [batch_size, num_heads, -1, attention_mask.shape[-1]]) + attention_mask = attention_mask.repeat_interleave(num_heads, axis=1) + attention_mask = paddle.reshape(attention_mask, [batch_size, num_heads, -1, attention_mask.shape[-1]]) if attention_mask.ndim == 4: if not transpose: @@ -361,9 +344,7 @@ def prepare_attention_mask(self, return attention_mask def norm_encoder_hidden_states(self, encoder_hidden_states): - assert ( - self.norm_cross is not None - ), "self.norm_cross must be defined to call self.norm_encoder_hidden_states" + assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states" if isinstance(self.norm_cross, nn.LayerNorm): encoder_hidden_states = self.norm_cross(encoder_hidden_states) @@ -384,24 +365,23 @@ def norm_encoder_hidden_states(self, encoder_hidden_states): class AttnProcessor: def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - 
encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -427,9 +407,7 @@ def __init__(self, in_features, out_features, rank=4, network_alpha=None): super().__init__() if rank > min(in_features, out_features): - raise ValueError( - f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}" - ) + raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}") self.down = nn.Linear(in_features, rank, bias_attr=False) self.up = nn.Linear(rank, out_features, bias_attr=False) @@ -469,39 +447,31 @@ class LoRAAttnProcessor(nn.Layer): Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. """ - def __init__(self, - hidden_size, - cross_attention_dim=None, - rank=4, - network_alpha=None): + def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): super().__init__() self.hidden_size = hidden_size self.cross_attention_dim = cross_attention_dim self.rank = rank - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - scale=1.0, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + scale=1.0, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) query = attn.head_to_batch_dim(query) @@ -509,13 +479,10 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora( - encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora( - encoder_hidden_states) + key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states) key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) @@ -525,8 +492,7 @@ def __call__( hidden_states = attn.batch_to_head_dim(hidden_states) # linear proj - 
hidden_states = attn.to_out[0]( - hidden_states) + scale * self.to_out_lora(hidden_states) + hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states) # dropout hidden_states = attn.to_out[1](hidden_states) @@ -535,13 +501,14 @@ def __call__( class CustomDiffusionAttnProcessor(nn.Layer): def __init__( - self, - train_kv=True, - train_q_out=True, - hidden_size=None, - cross_attention_dim=None, - out_bias=True, - dropout=0.0, ): + self, + train_kv=True, + train_q_out=True, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + ): super().__init__() self.train_kv = train_kv self.train_q_out = train_q_out @@ -551,35 +518,26 @@ def __init__( # `_custom_diffusion` id for easy serialization and loading. if self.train_kv: - self.to_k_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) - self.to_v_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) if self.train_q_out: - self.to_q_custom_diffusion = nn.Linear( - hidden_size, hidden_size, bias_attr=False) + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False) self.to_out_custom_diffusion = nn.LayerList([]) - self.to_out_custom_diffusion.append( - nn.Linear( - hidden_size, hidden_size, bias_attr=out_bias)) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias)) self.to_out_custom_diffusion.append(nn.Dropout(dropout)) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if self.train_q_out: query = self.to_q_custom_diffusion(hidden_states) else: @@ -591,8 +549,7 @@ def __call__( else: crossattn = True if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) if self.train_kv: key = self.to_k_custom_diffusion(encoder_hidden_states) @@ -631,40 +588,35 @@ def __call__( class AttnAddedKVProcessor: def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): residual = hidden_states - hidden_states = hidden_states.reshape( - [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1]) + hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( + [0, 2, 1] + ) batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, 
sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - hidden_states = attn.group_norm(hidden_states.transpose( - [0, 2, 1])).transpose([0, 2, 1]) + hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) query = attn.to_q(hidden_states) query = attn.head_to_batch_dim(query) encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj( - encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim( - encoder_hidden_states_key_proj) - encoder_hidden_states_value_proj = attn.head_to_batch_dim( - encoder_hidden_states_value_proj) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) if not attn.only_cross_attention: key = attn.to_k(hidden_states) @@ -672,8 +624,7 @@ def __call__( key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2) - value = paddle.concat( - [encoder_hidden_states_value_proj, value], axis=2) + value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2) else: key = encoder_hidden_states_key_proj value = encoder_hidden_states_value_proj @@ -687,53 +638,47 @@ def __call__( # dropout hidden_states = attn.to_out[1](hidden_states) - hidden_states = hidden_states.transpose( - [0, 2, 1]).reshape(residual.shape) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) hidden_states = hidden_states + residual return hidden_states class XFormersAttnAddedKVProcessor: - def __init__(self, attention_op: Optional[str]=None): + def __init__(self, attention_op: Optional[str] = None): assert attention_op in [None, "auto", "cutlass", "flash"] self.attention_op = attention_op def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): residual = hidden_states - hidden_states = hidden_states.reshape( - [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1]) + hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( + [0, 2, 1] + ) batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - hidden_states = attn.group_norm(hidden_states.transpose( - [0, 2, 1])).transpose([0, 2, 1]) + hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) query = attn.to_q(hidden_states) 
query = attn.head_to_batch_dim(query, transpose=False) encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj( - encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim( - encoder_hidden_states_key_proj, transpose=False) - encoder_hidden_states_value_proj = attn.head_to_batch_dim( - encoder_hidden_states_value_proj, transpose=False) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, transpose=False) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, transpose=False) if not attn.only_cross_attention: key = attn.to_k(hidden_states) @@ -741,8 +686,7 @@ def __call__( key = attn.head_to_batch_dim(key, transpose=False) value = attn.head_to_batch_dim(value, transpose=False) key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1) - value = paddle.concat( - [encoder_hidden_states_value_proj, value], axis=1) + value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1) else: key = encoder_hidden_states_key_proj value = encoder_hidden_states_value_proj @@ -755,7 +699,8 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) # linear proj @@ -763,39 +708,37 @@ def __call__( # dropout hidden_states = attn.to_out[1](hidden_states) - hidden_states = hidden_states.transpose( - [0, 2, 1]).reshape(residual.shape) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) hidden_states = hidden_states + residual return hidden_states class XFormersAttnProcessor: - def __init__(self, attention_op: Optional[str]=None): + def __init__(self, attention_op: Optional[str] = None): assert attention_op in [None, "auto", "cutlass", "flash"] self.attention_op = attention_op def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -813,7 +756,8 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) # hidden_states = hidden_states.cast(query.dtype) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) @@ -847,12 +791,13 @@ class LoRAXFormersAttnProcessor(nn.Layer): """ def __init__( - self, - hidden_size, - 
cross_attention_dim, - rank=4, - attention_op: Optional[str]=None, - network_alpha=None, ): + self, + hidden_size, + cross_attention_dim, + rank=4, + attention_op: Optional[str] = None, + network_alpha=None, + ): super().__init__() self.hidden_size = hidden_size @@ -860,28 +805,24 @@ def __init__( self.rank = rank self.attention_op = attention_op - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, - hidden_size, rank, network_alpha) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, - network_alpha) + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - scale=1.0, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + scale=1.0, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) query = attn.head_to_batch_dim(query, transpose=False) @@ -889,13 +830,10 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora( - encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora( - encoder_hidden_states) + key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states) key = attn.head_to_batch_dim(key, transpose=False) value = attn.head_to_batch_dim(value, transpose=False) @@ -908,13 +846,13 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) # linear proj - hidden_states = attn.to_out[0]( - hidden_states) + scale * self.to_out_lora(hidden_states) + hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states) # dropout hidden_states = attn.to_out[1](hidden_states) @@ -923,14 +861,15 @@ def __call__( class CustomDiffusionXFormersAttnProcessor(nn.Layer): def __init__( - self, - train_kv=True, - train_q_out=False, - hidden_size=None, - cross_attention_dim=None, - out_bias=True, - dropout=0.0, - attention_op: Optional[str]=None, ): + self, + 
train_kv=True, + train_q_out=False, + hidden_size=None, + cross_attention_dim=None, + out_bias=True, + dropout=0.0, + attention_op: Optional[str] = None, + ): super().__init__() assert attention_op in [None, "auto", "cutlass", "flash"] self.train_kv = train_kv @@ -942,36 +881,27 @@ def __init__( # `_custom_diffusion` id for easy serialization and loading. if self.train_kv: - self.to_k_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) - self.to_v_custom_diffusion = nn.Linear( - cross_attention_dim or hidden_size, - hidden_size, - bias_attr=False) + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias_attr=False) if self.train_q_out: - self.to_q_custom_diffusion = nn.Linear( - hidden_size, hidden_size, bias_attr=False) + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias_attr=False) self.to_out_custom_diffusion = nn.LayerList([]) - self.to_out_custom_diffusion.append( - nn.Linear( - hidden_size, hidden_size, bias_attr=out_bias)) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias_attr=out_bias)) self.to_out_custom_diffusion.append(nn.Dropout(dropout)) def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, transpose=False) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False) if self.train_q_out: query = self.to_q_custom_diffusion(hidden_states) @@ -984,8 +914,7 @@ def __call__( else: crossattn = True if attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) if self.train_kv: key = self.to_k_custom_diffusion(encoder_hidden_states) @@ -1013,7 +942,8 @@ def __call__( scale=attn.scale, dropout_p=0.0, training=attn.training, - attention_op=self.attention_op, ) + attention_op=self.attention_op, + ) # hidden_states = hidden_states.cast(query.dtype) hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False) @@ -1035,17 +965,17 @@ def __init__(self, slice_size): self.slice_size = slice_size def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): - batch_size, sequence_length, _ = (hidden_states.shape - if encoder_hidden_states is None else - encoder_hidden_states.shape) - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, out_dim=3) + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, 
out_dim=3) query = attn.to_q(hidden_states) query = attn.head_to_batch_dim(query) @@ -1053,8 +983,7 @@ def __call__( if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -1067,27 +996,23 @@ def __call__( batch_size_attention = query.shape[0] query_len = query.shape[1] - hidden_states = paddle.zeros( - (batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) + hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) for i in range(batch_size_attention // self.slice_size): start_idx = i * self.slice_size end_idx = (i + 1) * self.slice_size query_slice = query[start_idx:end_idx] key_slice = key[start_idx:end_idx] - attn_mask_slice = (attention_mask[start_idx:end_idx] - if attention_mask is not None else None) + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None - attn_slice = attn.get_attention_scores(query_slice, key_slice, - attn_mask_slice) + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx]) hidden_states[start_idx:end_idx] = attn_slice # reshape back to [bs, num_heads, seqlen, head_dim] - hidden_states = hidden_states.reshape( - [-1, attn.heads, query_len, attn.head_dim]) + hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim]) hidden_states = attn.batch_to_head_dim(hidden_states) # linear proj @@ -1103,42 +1028,37 @@ def __init__(self, slice_size): self.slice_size = slice_size def __call__( - self, - attn: "Attention", - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - **cross_attention_kwargs, ): + self, + attn: "Attention", + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + **cross_attention_kwargs, + ): residual = hidden_states - hidden_states = hidden_states.reshape( - [hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( - [0, 2, 1]) + hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose( + [0, 2, 1] + ) batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size, out_dim=3) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=3) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - hidden_states = attn.group_norm(hidden_states.transpose( - [0, 2, 1])).transpose([0, 2, 1]) + hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1]) query = attn.to_q(hidden_states) query = attn.head_to_batch_dim(query) encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) - encoder_hidden_states_value_proj = attn.add_v_proj( - encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) - encoder_hidden_states_key_proj = attn.head_to_batch_dim( - encoder_hidden_states_key_proj) - encoder_hidden_states_value_proj = attn.head_to_batch_dim( - encoder_hidden_states_value_proj) + 
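The slicing strategy used by SlicedAttnProcessor above can be restated in isolation; this is a toy sketch with assumed sizes, not code taken from the diff: only `slice_size` attention maps over the fused batch*heads dimension are materialized at a time.

    import paddle

    bh, q_len, kv_len, head_dim, slice_size = 16, 64, 64, 40, 4  # assumed sizes
    q = paddle.randn([bh, q_len, head_dim])
    k = paddle.randn([bh, kv_len, head_dim])
    v = paddle.randn([bh, kv_len, head_dim])
    out = paddle.zeros([bh, q_len, head_dim])
    for i in range(bh // slice_size):
        s, e = i * slice_size, (i + 1) * slice_size
        # softmax(q k^T / sqrt(d)) v, computed slice by slice to bound memory
        attn = paddle.nn.functional.softmax(
            paddle.matmul(q[s:e], k[s:e], transpose_y=True) * head_dim**-0.5, axis=-1
        )
        out[s:e] = paddle.matmul(attn, v[s:e])
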
encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) if not attn.only_cross_attention: key = attn.to_k(hidden_states) @@ -1146,8 +1066,7 @@ def __call__( key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2) - value = paddle.concat( - [encoder_hidden_states_value_proj, value], axis=2) + value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2) else: key = encoder_hidden_states_key_proj value = encoder_hidden_states_value_proj @@ -1159,8 +1078,7 @@ def __call__( batch_size_attention = query.shape[0] query_len = query.shape[1] - hidden_states = paddle.zeros( - (batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) + hidden_states = paddle.zeros((batch_size_attention, query_len, attn.head_dim), dtype=query.dtype) for i in range(batch_size_attention // self.slice_size): start_idx = i * self.slice_size @@ -1168,19 +1086,16 @@ def __call__( query_slice = query[start_idx:end_idx] key_slice = key[start_idx:end_idx] - attn_mask_slice = (attention_mask[start_idx:end_idx] - if attention_mask is not None else None) + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None - attn_slice = attn.get_attention_scores(query_slice, key_slice, - attn_mask_slice) + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) attn_slice = paddle.matmul(attn_slice, value[start_idx:end_idx]) hidden_states[start_idx:end_idx] = attn_slice # reshape back to [bs, num_heads, seqlen, head_dim] - hidden_states = hidden_states.reshape( - [-1, attn.heads, query_len, attn.head_dim]) + hidden_states = hidden_states.reshape([-1, attn.heads, query_len, attn.head_dim]) hidden_states = attn.batch_to_head_dim(hidden_states) @@ -1189,8 +1104,7 @@ def __call__( # dropout hidden_states = attn.to_out[1](hidden_states) - hidden_states = hidden_states.transpose( - [0, 2, 1]).reshape(residual.shape) + hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape) hidden_states = hidden_states + residual return hidden_states @@ -1200,9 +1114,17 @@ def __call__( AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor AttentionProcessor = Union[ - AttnProcessor, AttnProcessor2_5, XFormersAttnProcessor, SlicedAttnProcessor, - AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_5, - XFormersAttnAddedKVProcessor, LoRAAttnProcessor, LoRAXFormersAttnProcessor, - LoRAAttnProcessor2_5, CustomDiffusionAttnProcessor, + AttnProcessor, + AttnProcessor2_5, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_5, + XFormersAttnAddedKVProcessor, + LoRAAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_5, + CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, ] diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py index 3d3b531d927e3..69d1b0fb98bb2 100644 --- a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py +++ b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py @@ -69,29 +69,30 @@ class AutoencoderKL(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=3, - out_channels: int=3, - down_block_types: Tuple[str]=("DownEncoderBlock2D", ), - down_block_out_channels: Tuple[int]=None, - 
up_block_types: Tuple[str]=("UpDecoderBlock2D", ), - up_block_out_channels: Tuple[int]=None, - block_out_channels: Tuple[int]=(64, ), - layers_per_block: int=1, - act_fn: str="silu", - latent_channels: int=4, - norm_num_groups: int=32, - sample_size: int=32, - scaling_factor: float=0.18215, ): + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + down_block_out_channels: Tuple[int] = None, + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + up_block_out_channels: Tuple[int] = None, + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + scaling_factor: float = 0.18215, + ): super().__init__() # if down_block_out_channels not givien, we will use block_out_channels - _down_block_out_channels = (self.config.block_out_channels - if down_block_out_channels is None else - self.config.down_block_out_channels) + _down_block_out_channels = ( + self.config.block_out_channels if down_block_out_channels is None else self.config.down_block_out_channels + ) # if up_block_out_channels not givien, we will use block_out_channels - _up_block_out_channels = (self.config.block_out_channels - if up_block_out_channels is None else - self.config.up_block_out_channels) + _up_block_out_channels = ( + self.config.block_out_channels if up_block_out_channels is None else self.config.up_block_out_channels + ) # pass init params to Encoder self.encoder = Encoder( @@ -102,7 +103,8 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, - double_z=True, ) + double_z=True, + ) # pass init params to Decoder self.decoder = Decoder( @@ -112,7 +114,8 @@ def __init__( block_out_channels=_up_block_out_channels, layers_per_block=layers_per_block, norm_num_groups=norm_num_groups, - act_fn=act_fn, ) + act_fn=act_fn, + ) self.quant_conv = nn.Conv2D(2 * latent_channels, 2 * latent_channels, 1) self.post_quant_conv = nn.Conv2D(latent_channels, latent_channels, 1) @@ -122,18 +125,19 @@ def __init__( # only relevant if vae tiling is enabled self.tile_sample_min_size = self.config.sample_size - sample_size = (self.config.sample_size[0] - if isinstance(self.config.sample_size, (list, tuple)) - else self.config.sample_size) - self.tile_latent_min_size = int(sample_size / - (2**(len(_up_block_out_channels) - 1))) + sample_size = ( + self.config.sample_size[0] + if isinstance(self.config.sample_size, (list, tuple)) + else self.config.sample_size + ) + self.tile_latent_min_size = int(sample_size / (2 ** (len(_up_block_out_channels) - 1))) self.tile_overlap_factor = 0.25 def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (Encoder, Decoder)): module.gradient_checkpointing = value - def enable_tiling(self, use_tiling: bool=True): + def enable_tiling(self, use_tiling: bool = True): r""" Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in several steps. 
This is useful to save a large amount of memory and to allow @@ -163,12 +167,10 @@ def disable_slicing(self): self.use_slicing = False @apply_forward_hook - def encode(self, x: paddle.Tensor, - return_dict: bool=True) -> AutoencoderKLOutput: + def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: # TODO junnyu, support float16 x = x.cast(self.encoder.conv_in.weight.dtype) - if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or - x.shape[-2] > self.tile_sample_min_size): + if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): return self.tiled_encode(x, return_dict=return_dict) h = self.encoder(x) @@ -176,57 +178,49 @@ def encode(self, x: paddle.Tensor, posterior = DiagonalGaussianDistribution(moments) if not return_dict: - return (posterior, ) + return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) - def _decode(self, z: paddle.Tensor, - return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]: - if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or - z.shape[-2] > self.tile_latent_min_size): + def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]: + if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): return self.tiled_decode(z, return_dict=return_dict) z = self.post_quant_conv(z) dec = self.decoder(z) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) @apply_forward_hook - def decode(self, z: paddle.Tensor, - return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]: + def decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]: # TODO junnyu, add this to support pure fp16 z = z.cast(self.post_quant_conv.weight.dtype) if self.use_slicing and z.shape[0] > 1: # split、chunk paddle vs pytorch may have some difference - decoded_slices = [ - self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0]) - ] + decoded_slices = [self._decode(z_slice).sample for z_slice in z.chunk(z.shape[0])] decoded = paddle.concat(decoded_slices) else: decoded = self._decode(z).sample if not return_dict: - return (decoded, ) + return (decoded,) return DecoderOutput(sample=decoded) def blend_v(self, a, b, blend_extent): for y in range(min(a.shape[2], b.shape[2], blend_extent)): - b[:, :, y, :] = a[:, :, -blend_extent + y, :] * ( - 1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) + b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) return b def blend_h(self, a, b, blend_extent): for x in range(min(a.shape[3], b.shape[3], blend_extent)): - b[:, :, :, x] = a[:, :, :, -blend_extent + x] * ( - 1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) + b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) return b - def tiled_encode(self, x: paddle.Tensor, - return_dict: bool=True) -> AutoencoderKLOutput: + def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: r"""Encode a batch of images using a tiled encoder. Args: When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several @@ -237,8 +231,7 @@ def tiled_encode(self, x: paddle.Tensor, x (`paddle.Tensor`): Input batch of images. 
return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`AutoencoderKLOutput`] instead of a plain tuple. """ - overlap_size = int(self.tile_sample_min_size * - (1 - self.tile_overlap_factor)) + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) row_limit = self.tile_latent_min_size - blend_extent @@ -247,8 +240,12 @@ def tiled_encode(self, x: paddle.Tensor, for i in range(0, x.shape[2], overlap_size): row = [] for j in range(0, x.shape[3], overlap_size): - tile = x[:, :, i:i + self.tile_sample_min_size, j:j + - self.tile_sample_min_size, ] + tile = x[ + :, + :, + i : i + self.tile_sample_min_size, + j : j + self.tile_sample_min_size, + ] tile = self.encoder(tile) tile = self.quant_conv(tile) row.append(tile) @@ -270,13 +267,11 @@ def tiled_encode(self, x: paddle.Tensor, posterior = DiagonalGaussianDistribution(moments) if not return_dict: - return (posterior, ) + return (posterior,) return AutoencoderKLOutput(latent_dist=posterior) - def tiled_decode( - self, z: paddle.Tensor, - return_dict: bool=True) -> Union[DecoderOutput, paddle.Tensor]: + def tiled_decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]: r"""Decode a batch of images using a tiled decoder. Args: When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several @@ -288,8 +283,7 @@ def tiled_decode( `True`): Whether or not to return a [`DecoderOutput`] instead of a plain tuple. """ - overlap_size = int(self.tile_latent_min_size * - (1 - self.tile_overlap_factor)) + overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) row_limit = self.tile_sample_min_size - blend_extent @@ -299,8 +293,12 @@ def tiled_decode( for i in range(0, z.shape[2], overlap_size): row = [] for j in range(0, z.shape[3], overlap_size): - tile = z[:, :, i:i + self.tile_latent_min_size, j:j + - self.tile_latent_min_size, ] + tile = z[ + :, + :, + i : i + self.tile_latent_min_size, + j : j + self.tile_latent_min_size, + ] tile = self.post_quant_conv(tile) decoded = self.decoder(tile) row.append(decoded) @@ -320,17 +318,17 @@ def tiled_decode( dec = paddle.concat(result_rows, axis=2) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) def forward( - self, - sample: paddle.Tensor, - sample_posterior: bool=False, - return_dict: bool=True, - generator: Optional[paddle.Generator]=None, ) -> Union[ - DecoderOutput, paddle.Tensor]: + self, + sample: paddle.Tensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[paddle.Generator] = None, + ) -> Union[DecoderOutput, paddle.Tensor]: r""" Args: sample (`paddle.Tensor`): Input sample. 
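As a usage note for the tiling and slicing switches above, a small sketch (the checkpoint name and image size are assumed, not taken from this diff): once enabled, encode()/decode() route oversized inputs through tiled_encode()/tiled_decode() and blend the tile seams.

    import paddle
    from ppdiffusers import AutoencoderKL

    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")  # assumed source
    vae.enable_tiling()   # split large images into overlapping tiles and blend the seams
    vae.enable_slicing()  # decode the batch one sample at a time
    with paddle.no_grad():
        image = paddle.randn([1, 3, 1024, 1024])
        latents = vae.encode(image).latent_dist.sample() * vae.config.scaling_factor
        decoded = vae.decode(latents / vae.config.scaling_factor).sample
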
@@ -348,6 +346,6 @@ def forward( dec = self.decode(z).sample if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) diff --git a/ppdiffusers/ppdiffusers/models/controlnet.py b/ppdiffusers/ppdiffusers/models/controlnet.py index 6662f2904992c..2ac640f58f21e 100644 --- a/ppdiffusers/ppdiffusers/models/controlnet.py +++ b/ppdiffusers/ppdiffusers/models/controlnet.py @@ -25,8 +25,12 @@ from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin -from .unet_2d_blocks import (CrossAttnDownBlock2D, DownBlock2D, - UNetMidBlock2DCrossAttn, get_down_block) +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + get_down_block, +) from .unet_2d_condition import UNet2DConditionModel logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -54,37 +58,31 @@ class ControlNetConditioningEmbedding(nn.Layer): """ def __init__( - self, - conditioning_embedding_channels: int, - conditioning_channels: int=3, - block_out_channels: Tuple[int]=(16, 32, 96, 256), ): + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: Tuple[int] = (16, 32, 96, 256), + ): super().__init__() - self.conv_in = nn.Conv2D( - conditioning_channels, - block_out_channels[0], - kernel_size=3, - padding=1) + self.conv_in = nn.Conv2D(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) self.blocks = nn.LayerList([]) for i in range(len(block_out_channels) - 1): channel_in = block_out_channels[i] channel_out = block_out_channels[i + 1] - self.blocks.append( - nn.Conv2D( - channel_in, channel_in, kernel_size=3, padding=1)) - self.blocks.append( - nn.Conv2D( - channel_in, channel_out, kernel_size=3, padding=1, - stride=2)) + self.blocks.append(nn.Conv2D(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2D(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) self.conv_out = zero_module( nn.Conv2D( block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, - padding=1, )) + padding=1, + ) + ) def forward(self, conditioning): embedding = self.conv_in(conditioning) @@ -104,36 +102,37 @@ class ControlNetModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=4, - flip_sin_to_cos: bool=True, - freq_shift: int=0, - down_block_types: Tuple[str]=( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", ), - only_cross_attention: Union[bool, Tuple[bool]]=False, - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: int=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-5, - cross_attention_dim: int=1280, - attention_head_dim: Union[int, Tuple[int]]=8, - use_linear_projection: bool=False, - class_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - upcast_attention: bool=False, - resnet_time_scale_shift: str="default", - projection_class_embeddings_input_dim: Optional[int]=None, - controlnet_conditioning_channel_order: str="rgb", - conditioning_embedding_out_channels: Optional[Tuple[int]]=(16, 32, - 96, 256), - global_pool_conditions: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int = 4, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + 
"CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + attention_head_dim: Union[int, Tuple[int]] = 8, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + projection_class_embeddings_input_dim: Optional[int] = None, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), + global_pool_conditions: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() # Check inputs @@ -142,16 +141,12 @@ def __init__( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - only_cross_attention, - bool) and len(only_cross_attention) != len(down_block_types): + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): raise ValueError( f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." ) @@ -163,27 +158,26 @@ def __init__( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time time_embed_dim = block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding( timestep_input_dim, time_embed_dim, - act_fn=act_fn, ) + act_fn=act_fn, + ) # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -198,25 +192,24 @@ def __init__( # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
- self.class_embedding = TimestepEmbedding( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None # control net conditioning embedding self.controlnet_cond_embedding = ControlNetConditioningEmbedding( conditioning_embedding_channels=block_out_channels[0], - block_out_channels=conditioning_embedding_out_channels, ) + block_out_channels=conditioning_embedding_out_channels, + ) self.down_blocks = nn.LayerList([]) self.controlnet_down_blocks = nn.LayerList([]) if isinstance(only_cross_attention, bool): - only_cross_attention = [only_cross_attention] * len( - down_block_types) + only_cross_attention = [only_cross_attention] * len(down_block_types) if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) # pre_temb_act_fun opt self.resnet_pre_temb_non_linearity = resnet_pre_temb_non_linearity @@ -233,8 +226,7 @@ def __init__( # down output_channel = block_out_channels[0] - controlnet_block = nn.Conv2D( - output_channel, output_channel, kernel_size=1) + controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_down_blocks.append(controlnet_block) @@ -260,27 +252,24 @@ def __init__( only_cross_attention=only_cross_attention[i], upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=self. - resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) for _ in range(layers_per_block): - controlnet_block = nn.Conv2D( - output_channel, output_channel, kernel_size=1) + controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_down_blocks.append(controlnet_block) if not is_final_block: - controlnet_block = nn.Conv2D( - output_channel, output_channel, kernel_size=1) + controlnet_block = nn.Conv2D(output_channel, output_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_down_blocks.append(controlnet_block) # mid mid_block_channel = block_out_channels[-1] - controlnet_block = nn.Conv2D( - mid_block_channel, mid_block_channel, kernel_size=1) + controlnet_block = nn.Conv2D(mid_block_channel, mid_block_channel, kernel_size=1) controlnet_block = zero_module(controlnet_block) self.controlnet_mid_block = controlnet_block @@ -296,16 +285,17 @@ def __init__( resnet_groups=norm_num_groups, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=self.resnet_pre_temb_non_linearity, + ) @classmethod def from_unet( - cls, - unet: UNet2DConditionModel, - controlnet_conditioning_channel_order: str="rgb", - conditioning_embedding_out_channels: Optional[Tuple[int]]=(16, 32, - 96, 256), - load_weights_from_unet: bool=True, ): + cls, + unet: UNet2DConditionModel, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int]] = (16, 32, 96, 256), + load_weights_from_unet: bool = True, + ): r""" Instantiate Controlnet class from UNet2DConditionModel. 
Parameters: @@ -333,22 +323,19 @@ def from_unet( num_class_embeds=unet.config.num_class_embeds, upcast_attention=unet.config.upcast_attention, resnet_time_scale_shift=unet.config.resnet_time_scale_shift, - projection_class_embeddings_input_dim=unet.config. - projection_class_embeddings_input_dim, + projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, conditioning_embedding_out_channels=conditioning_embedding_out_channels, - resnet_pre_temb_non_linearity=unet.config. - resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=unet.config.resnet_pre_temb_non_linearity, + ) if load_weights_from_unet: controlnet.conv_in.load_dict(unet.conv_in.state_dict()) controlnet.time_proj.load_dict(unet.time_proj.state_dict()) - controlnet.time_embedding.load_dict(unet.time_embedding.state_dict( - )) + controlnet.time_embedding.load_dict(unet.time_embedding.state_dict()) if controlnet.class_embedding: - controlnet.class_embedding.load_dict( - unet.class_embedding.state_dict()) + controlnet.class_embedding.load_dict(unet.class_embedding.state_dict()) controlnet.down_blocks.load_dict(unet.down_blocks.state_dict()) controlnet.mid_block.load_dict(unet.mid_block.state_dict()) @@ -365,16 +352,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -383,9 +366,7 @@ def fn_recursive_add_processors( return processors - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -409,8 +390,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -457,8 +437,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -470,14 +449,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. 
# Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -493,18 +470,19 @@ def _set_gradient_checkpointing(self, module, value=False): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: paddle.Tensor, - conditioning_scale: Union[List[float], float]=1.0, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - guess_mode: bool=False, - return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + controlnet_cond: paddle.Tensor, + conditioning_scale: Union[List[float], float] = 1.0, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: # TODO junnyu, add this to support pure fp16 sample = sample.cast(self.dtype) @@ -517,9 +495,7 @@ def forward( elif channel_order == "bgr": controlnet_cond = paddle.flip(controlnet_cond, axis=[1]) else: - raise ValueError( - f"unknown `controlnet_conditioning_channel_order`: {channel_order}" - ) + raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") # prepare attention_mask if attention_mask is not None: @@ -534,7 +510,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) @@ -547,8 +527,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when num_class_embeds > 0") + raise ValueError("class_labels should be provided when num_class_embeds > 0") # maybe cast it to float16 class_labels = class_labels.cast(self.dtype) @@ -572,20 +551,19 @@ def forward( sample += controlnet_cond # 3. 
down - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: sample, res_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) down_block_res_samples += res_samples @@ -596,16 +574,16 @@ def forward( emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) # 5. Control net blocks controlnet_down_block_res_samples = () - for down_block_res_sample, controlnet_block in zip( - down_block_res_samples, self.controlnet_down_blocks): + for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): down_block_res_sample = controlnet_block(down_block_res_sample) - controlnet_down_block_res_samples += (down_block_res_sample, ) + controlnet_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = controlnet_down_block_res_samples @@ -613,45 +591,34 @@ def forward( # 6. scaling if guess_mode: - scales = paddle.logspace( - -1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0 + scales = paddle.logspace(-1, 0, len(down_block_res_samples) + 1) # 0.1 to 1.0 scales *= conditioning_scale - down_block_res_samples = [ - sample * scale - for sample, scale in zip(down_block_res_samples, scales) - ] + down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)] mid_block_res_sample *= scales[-1] # last one else: # add conditioning_scale https://github.com/huggingface/diffusers/pull/2627 if isinstance(conditioning_scale, (float, int)): - down_block_res_samples = [ - sample * conditioning_scale - for sample in down_block_res_samples - ] + down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] mid_block_res_sample *= conditioning_scale else: down_block_res_samples = [ - sample * ccs - for sample, ccs in zip(down_block_res_samples, - conditioning_scale[:-1]) + sample * ccs for sample, ccs in zip(down_block_res_samples, conditioning_scale[:-1]) ] mid_block_res_sample *= conditioning_scale[-1] if self.config.global_pool_conditions: down_block_res_samples = [ - paddle.mean( - sample, axis=(2, 3), keepdim=True) - for sample in down_block_res_samples + paddle.mean(sample, axis=(2, 3), keepdim=True) for sample in down_block_res_samples ] - mid_block_res_sample = paddle.mean( - mid_block_res_sample, axis=(2, 3), keepdim=True) + mid_block_res_sample = paddle.mean(mid_block_res_sample, axis=(2, 3), keepdim=True) if not return_dict: return (down_block_res_samples, mid_block_res_sample) return ControlNetOutput( down_block_res_samples=down_block_res_samples, - mid_block_res_sample=mid_block_res_sample, ) + mid_block_res_sample=mid_block_res_sample, + ) def zero_module(module): diff --git a/ppdiffusers/ppdiffusers/models/cross_attention.py b/ppdiffusers/ppdiffusers/models/cross_attention.py index 06660a99f385d..10911591e9f36 100644 --- 
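The scaling logic above is easiest to read from the caller's side; the following sketch assumes `controlnet`, `unet`, `noisy_latents`, `t`, `text_emb`, and `cond_image` are already built, and the UNet residual keyword names follow the usual diffusers-style API rather than this diff.

    # Assumed, pre-built objects: controlnet, unet, noisy_latents, t, text_emb, cond_image.
    down_res, mid_res = controlnet(
        sample=noisy_latents,
        timestep=t,
        encoder_hidden_states=text_emb,
        controlnet_cond=cond_image,
        conditioning_scale=0.8,  # a single float scales every block equally
        guess_mode=False,        # True ramps the scales from 0.1 to 1.0 across blocks
        return_dict=False,
    )
    noise_pred = unet(
        noisy_latents, t, encoder_hidden_states=text_emb,
        down_block_additional_residuals=down_res,      # assumed keyword names
        mid_block_additional_residual=mid_res,
    ).sample
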
a/ppdiffusers/ppdiffusers/models/cross_attention.py +++ b/ppdiffusers/ppdiffusers/models/cross_attention.py @@ -15,17 +15,21 @@ from .attention_processor import AttentionProcessor # noqa: F401 from .attention_processor import AttnProcessor2_5 # noqa: F401 from .attention_processor import Attention, AttnAddedKVProcessor -from .attention_processor import \ - AttnProcessor as AttnProcessorRename # noqa: F401 +from .attention_processor import AttnProcessor as AttnProcessorRename # noqa: F401 from .attention_processor import ( - LoRAAttnProcessor, LoRALinearLayer, LoRAXFormersAttnProcessor, - SlicedAttnAddedKVProcessor, SlicedAttnProcessor, XFormersAttnProcessor) + LoRAAttnProcessor, + LoRAXFormersAttnProcessor, + SlicedAttnAddedKVProcessor, + SlicedAttnProcessor, + XFormersAttnProcessor, +) deprecate( "cross_attention", "0.18.0", "Importing from cross_attention is deprecated. Please import from diffusers.models.attention_processor instead.", - standard_warn=False, ) + standard_warn=False, +) AttnProcessor = AttentionProcessor @@ -33,86 +37,54 @@ class CrossAttention(Attention): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class CrossAttnProcessor(AttnProcessorRename): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class LoRACrossAttnProcessor(LoRAAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class CrossAttnAddedKVProcessor(AttnAddedKVProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class XFormersCrossAttnProcessor(XFormersAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." 
- deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class LoRAXFormersCrossAttnProcessor(LoRAXFormersAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class SlicedCrossAttnProcessor(SlicedAttnProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class SlicedCrossAttnAddedKVProcessor(SlicedAttnAddedKVProcessor): def __init__(self, *args, **kwargs): deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate( - "cross_attention", - "0.18.0", - deprecation_message, - standard_warn=False) + deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) diff --git a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py index d1f6482176d0d..d6f680e81fc62 100644 --- a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py +++ b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py @@ -47,35 +47,40 @@ class DualTransformer2DModel(nn.Layer): """ def __init__( - self, - num_attention_heads: int=16, - attention_head_dim: int=88, - in_channels: Optional[int]=None, - num_layers: int=1, - dropout: float=0.0, - norm_num_groups: int=32, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - sample_size: Optional[int]=None, - num_vector_embeds: Optional[int]=None, - activation_fn: str="geglu", - num_embeds_ada_norm: Optional[int]=None, ): + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + ): super().__init__() - self.transformers = nn.LayerList([ - Transformer2DModel( - num_attention_heads=num_attention_heads, - attention_head_dim=attention_head_dim, - in_channels=in_channels, - num_layers=num_layers, - dropout=dropout, - norm_num_groups=norm_num_groups, - cross_attention_dim=cross_attention_dim, - attention_bias=attention_bias, - sample_size=sample_size, - num_vector_embeds=num_vector_embeds, - activation_fn=activation_fn, - num_embeds_ada_norm=num_embeds_ada_norm, ) for _ in range(2) - ]) + self.transformers = nn.LayerList( + [ + Transformer2DModel( + 
num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + in_channels=in_channels, + num_layers=num_layers, + dropout=dropout, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + sample_size=sample_size, + num_vector_embeds=num_vector_embeds, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + ) + for _ in range(2) + ] + ) # Variables that can be set by a pipeline: @@ -91,13 +96,14 @@ def __init__( self.transformer_index_for_condition = [1, 0] def forward( - self, - hidden_states, - encoder_hidden_states, - timestep=None, - attention_mask=None, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + encoder_hidden_states, + timestep=None, + attention_mask=None, + cross_attention_kwargs=None, + return_dict: bool = True, + ): """ Args: hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. @@ -125,23 +131,22 @@ def forward( # attention_mask is not used yet for i in range(2): # for each of the two transformers, pass the corresponding condition tokens - condition_state = encoder_hidden_states[:, tokens_start:tokens_start - + self.condition_lengths[i]] + condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]] transformer_index = self.transformer_index_for_condition[i] encoded_state = self.transformers[transformer_index]( input_states, encoder_hidden_states=condition_state, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] encoded_states.append(encoded_state - input_states) tokens_start += self.condition_lengths[i] - output_states = encoded_states[0] * self.mix_ratio + encoded_states[ - 1] * (1 - self.mix_ratio) + output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio) output_states = output_states + input_states if not return_dict: - return (output_states, ) + return (output_states,) return Transformer2DModelOutput(sample=output_states) diff --git a/ppdiffusers/ppdiffusers/models/ema.py b/ppdiffusers/ppdiffusers/models/ema.py index 1d88a8a18c498..b42e0c2ad02ad 100644 --- a/ppdiffusers/ppdiffusers/models/ema.py +++ b/ppdiffusers/ppdiffusers/models/ema.py @@ -34,14 +34,11 @@ def __init__(self, model, decay=0.9999, use_num_upates=True): raise ValueError("Decay must be between 0 and 1") self.m_name2s_name = {} - self.register_buffer( - "decay", paddle.to_tensor( - decay, dtype=paddle.float32)) + self.register_buffer("decay", paddle.to_tensor(decay, dtype=paddle.float32)) self.register_buffer( "num_updates", - paddle.to_tensor( - 0, dtype=paddle.int64) if use_num_upates else paddle.to_tensor( - -1, dtype=paddle.int64), ) + paddle.to_tensor(0, dtype=paddle.int64) if use_num_upates else paddle.to_tensor(-1, dtype=paddle.int64), + ) for name, p in model.named_parameters(): if not p.stop_gradient: @@ -57,8 +54,7 @@ def forward(self, model): if self.num_updates >= 0: self.num_updates += 1 - decay = min(self.decay, - (1 + self.num_updates) / (10 + self.num_updates)) + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) one_minus_decay = 1.0 - decay @@ -79,8 +75,7 @@ def copy_to(self, model): shadow_params = dict(self.named_buffers()) for key in m_param: if not m_param[key].stop_gradient: - m_param[key].copy_(shadow_params[self.m_name2s_name[key]], - False) + m_param[key].copy_(shadow_params[self.m_name2s_name[key]], False) else: assert key not in 
self.m_name2s_name @@ -91,9 +86,7 @@ def store(self, parameters): parameters: Iterable of `EagerParamBase`; the parameters to be temporarily stored. """ - self.collected_params = [ - param.detach().cpu().clone() for param in parameters - ] + self.collected_params = [param.detach().cpu().clone() for param in parameters] def restore(self, parameters): """ diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py index 9527cf3ae055b..4c38ff3d44a98 100644 --- a/ppdiffusers/ppdiffusers/models/embeddings.py +++ b/ppdiffusers/ppdiffusers/models/embeddings.py @@ -21,12 +21,13 @@ def get_timestep_embedding( - timesteps: paddle.Tensor, - embedding_dim: int, - flip_sin_to_cos: bool=False, - downscale_freq_shift: float=1, - scale: float=1, - max_period: int=10000, ): + timesteps: paddle.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = False, + downscale_freq_shift: float = 1, + scale: float = 1, + max_period: int = 10000, +): """ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. @@ -38,8 +39,7 @@ def get_timestep_embedding( assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" half_dim = embedding_dim // 2 - exponent = -math.log(max_period) * paddle.arange( - start=0, end=half_dim, dtype="float32") + exponent = -math.log(max_period) * paddle.arange(start=0, end=half_dim, dtype="float32") exponent = exponent / (half_dim - downscale_freq_shift) @@ -62,10 +62,7 @@ def get_timestep_embedding( return emb -def get_2d_sincos_pos_embed(embed_dim, - grid_size, - cls_token=False, - extra_tokens=0): +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0): """ grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) @@ -78,8 +75,7 @@ def get_2d_sincos_pos_embed(embed_dim, grid = grid.reshape([2, 1, grid_size, grid_size]) pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) if cls_token and extra_tokens > 0: - pos_embed = np.concatenate( - [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) return pos_embed @@ -88,10 +84,8 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): raise ValueError("embed_dim must be divisible by 2") # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, - grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, - grid[1]) # (H*W, D/2) + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) return emb @@ -122,16 +116,17 @@ class PatchEmbed(nn.Layer): """2D Image to Patch Embedding""" def __init__( - self, - height=224, - width=224, - patch_size=16, - in_channels=3, - embed_dim=768, - layer_norm=False, - flatten=True, - bias=True, - add_pos_embed=True, ): + self, + height=224, + width=224, + patch_size=16, + in_channels=3, + embed_dim=768, + layer_norm=False, + flatten=True, + bias=True, + add_pos_embed=True, + ): super().__init__() num_patches = (height // patch_size) * (width // patch_size) @@ -143,22 +138,22 @@ def __init__( embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, - bias_attr=bias, ) + bias_attr=bias, + ) if layer_norm: # elementwise_affine=False -> 
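The sinusoidal construction in get_timestep_embedding above reduces, with the default flip_sin_to_cos=False, downscale_freq_shift=1 and scale=1, to the following compact restatement (toy sizes, for illustration only).

    import math

    import paddle

    t = paddle.to_tensor([1.0, 2.0, 50.0])          # timesteps
    half_dim, max_period = 4, 10000                 # embedding_dim == 8
    freqs = paddle.exp(paddle.arange(0, half_dim, dtype="float32")
                       * (-math.log(max_period) / (half_dim - 1)))
    emb = paddle.concat([paddle.sin(t[:, None] * freqs[None, :]),
                         paddle.cos(t[:, None] * freqs[None, :])], axis=-1)  # shape [3, 8]
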
weight_attr=False, bias_attr=False - self.norm = nn.LayerNorm( - embed_dim, epsilon=1e-6, weight_attr=False, bias_attr=False) + self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6, weight_attr=False, bias_attr=False) else: self.norm = None self.add_pos_embed = add_pos_embed if add_pos_embed: - pos_embed = get_2d_sincos_pos_embed(embed_dim, - int(num_patches**0.5)) + pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5)) self.register_buffer( "pos_embed", paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0), - persistable=False, ) + persistable=False, + ) def forward(self, latent): latent = self.proj(latent) @@ -174,20 +169,20 @@ def forward(self, latent): class TimestepEmbedding(nn.Layer): def __init__( - self, - in_channels: int, - time_embed_dim: int, - act_fn: str="silu", - out_dim: int=None, - post_act_fn: Optional[str]=None, - cond_proj_dim=None, ): + self, + in_channels: int, + time_embed_dim: int, + act_fn: str = "silu", + out_dim: int = None, + post_act_fn: Optional[str] = None, + cond_proj_dim=None, + ): super().__init__() self.linear_1 = nn.Linear(in_channels, time_embed_dim) if cond_proj_dim is not None: - self.cond_proj = nn.Linear( - cond_proj_dim, in_channels, bias_attr=False) + self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias_attr=False) else: self.cond_proj = None @@ -198,9 +193,7 @@ def __init__( elif act_fn == "gelu": self.act = nn.GELU() else: - raise ValueError( - f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'" - ) + raise ValueError(f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'") if out_dim is not None: time_embed_dim_out = out_dim @@ -217,9 +210,7 @@ def __init__( elif post_act_fn == "gelu": self.post_act = nn.GELU() else: - raise ValueError( - f"{post_act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'" - ) + raise ValueError(f"{post_act_fn} does not exist. 
Make sure to define one of 'silu', 'mish', or 'gelu'") def forward(self, sample, condition=None): if condition is not None: @@ -237,10 +228,7 @@ def forward(self, sample, condition=None): class Timesteps(nn.Layer): - def __init__(self, - num_channels: int, - flip_sin_to_cos: bool, - downscale_freq_shift: float): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): super().__init__() self.num_channels = num_channels self.flip_sin_to_cos = flip_sin_to_cos @@ -251,7 +239,8 @@ def forward(self, timesteps): timesteps, self.num_channels, flip_sin_to_cos=self.flip_sin_to_cos, - downscale_freq_shift=self.downscale_freq_shift, ) + downscale_freq_shift=self.downscale_freq_shift, + ) return t_emb @@ -259,20 +248,21 @@ class GaussianFourierProjection(nn.Layer): """Gaussian Fourier embeddings for noise levels.""" def __init__( - self, - embedding_size: int=256, - scale: float=1.0, - set_W_to_weight=True, - log=True, - flip_sin_to_cos=False, ): + self, + embedding_size: int = 256, + scale: float = 1.0, + set_W_to_weight=True, + log=True, + flip_sin_to_cos=False, + ): super().__init__() - self.register_buffer("weight", paddle.randn((embedding_size, )) * scale) + self.register_buffer("weight", paddle.randn((embedding_size,)) * scale) self.log = log self.flip_sin_to_cos = flip_sin_to_cos if set_W_to_weight: # to delete later - self.register_buffer("W", paddle.randn((embedding_size, )) * scale) + self.register_buffer("W", paddle.randn((embedding_size,)) * scale) self.weight = self.W @@ -285,11 +275,9 @@ def forward(self, x): x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi if self.flip_sin_to_cos: - out = paddle.concat( - [paddle.cos(x_proj), paddle.sin(x_proj)], axis=-1) + out = paddle.concat([paddle.cos(x_proj), paddle.sin(x_proj)], axis=-1) else: - out = paddle.concat( - [paddle.sin(x_proj), paddle.cos(x_proj)], axis=-1) + out = paddle.concat([paddle.sin(x_proj), paddle.cos(x_proj)], axis=-1) return out @@ -318,11 +306,12 @@ class ImagePositionalEmbeddings(nn.Layer): """ def __init__( - self, - num_embed: int, - height: int, - width: int, - embed_dim: int, ): + self, + num_embed: int, + height: int, + width: int, + embed_dim: int, + ): super().__init__() self.height = height @@ -337,14 +326,12 @@ def __init__( def forward(self, index): emb = self.emb(index) - height_emb = self.height_emb( - paddle.arange(self.height).reshape([1, self.height])) + height_emb = self.height_emb(paddle.arange(self.height).reshape([1, self.height])) # 1 x H x D -> 1 x H x 1 x D height_emb = height_emb.unsqueeze(2) - width_emb = self.width_emb( - paddle.arange(self.width).reshape([1, self.width])) + width_emb = self.width_emb(paddle.arange(self.width).reshape([1, self.width])) # 1 x W x D -> 1 x 1 x W x D width_emb = width_emb.unsqueeze(1) @@ -354,7 +341,7 @@ def forward(self, index): # 1 x H x W x D -> 1 x L xD pos_emb = pos_emb.reshape([1, self.height * self.width, -1]) - emb = emb + pos_emb[:, :emb.shape[1], :] + emb = emb + pos_emb[:, : emb.shape[1], :] return emb @@ -372,8 +359,7 @@ class LabelEmbedding(nn.Layer): def __init__(self, num_classes, hidden_size, dropout_prob): super().__init__() use_cfg_embedding = dropout_prob > 0 - self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, - hidden_size) + self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) self.num_classes = num_classes self.dropout_prob = dropout_prob @@ -382,7 +368,12 @@ def token_drop(self, labels, force_drop_ids=None): Drops labels to enable classifier-free guidance. 
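This label drop is the training half of classifier-free guidance: with probability dropout_prob a label is swapped for the extra "null" class id, as the code below does. A toy restatement (illustrative numbers, with paddle.full_like substituted for the scalar form):

    import paddle

    num_classes, dropout_prob = 10, 0.1
    labels = paddle.to_tensor([3, 7, 1, 9])
    drop = paddle.rand([labels.shape[0]]) < dropout_prob
    # dropped labels point at the extra "null" class id == num_classes
    labels = paddle.where(drop, paddle.full_like(labels, num_classes), labels)
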
""" if force_drop_ids is None: - drop_ids = (paddle.rand((labels.shape[0], ), ) < self.dropout_prob) + drop_ids = ( + paddle.rand( + (labels.shape[0],), + ) + < self.dropout_prob + ) else: drop_ids = paddle.to_tensor(force_drop_ids == 1) labels = paddle.where(drop_ids, self.num_classes, labels) @@ -400,17 +391,13 @@ class CombinedTimestepLabelEmbeddings(nn.Layer): def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): super().__init__() - self.time_proj = Timesteps( - num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) - self.timestep_embedder = TimestepEmbedding( - in_channels=256, time_embed_dim=embedding_dim) - self.class_embedder = LabelEmbedding(num_classes, embedding_dim, - class_dropout_prob) + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) + self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob) def forward(self, timestep, class_labels, hidden_dtype=None): timesteps_proj = self.time_proj(timestep) - timesteps_emb = self.timestep_embedder( - timesteps_proj.cast(hidden_dtype)) # (N, D) + timesteps_emb = self.timestep_embedder(timesteps_proj.cast(hidden_dtype)) # (N, D) class_labels = self.class_embedder(class_labels) # (N, D) @@ -420,8 +407,7 @@ def forward(self, timestep, class_labels, hidden_dtype=None): class TextTimeEmbedding(nn.Layer): - def __init__(self, encoder_dim: int, time_embed_dim: int, - num_heads: int=64): + def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64): super().__init__() self.norm1 = nn.LayerNorm(encoder_dim) self.pool = AttentionPooling(num_heads, encoder_dim) @@ -443,8 +429,8 @@ def __init__(self, num_heads, embed_dim, dtype=None): super().__init__() self.positional_embedding = self.create_parameter( (1, embed_dim), - default_initializer=nn.initializer.Assign( - paddle.randn((1, embed_dim)) / embed_dim**0.5), ) + default_initializer=nn.initializer.Assign(paddle.randn((1, embed_dim)) / embed_dim**0.5), + ) self.k_proj = nn.Linear(embed_dim, embed_dim) self.q_proj = nn.Linear(embed_dim, embed_dim) self.v_proj = nn.Linear(embed_dim, embed_dim) @@ -466,8 +452,7 @@ def shape(x): x = x.transpose([0, 2, 1]) return x - class_token = x.mean( - axis=1, keepdim=True) + self.positional_embedding.cast(x.dtype) + class_token = x.mean(axis=1, keepdim=True) + self.positional_embedding.cast(x.dtype) x = paddle.concat([class_token, x], axis=1) # (bs, length+1, width) # (bs*n_heads, class_token_length, dim_per_head) @@ -478,10 +463,9 @@ def shape(x): # (bs*n_heads, class_token_length, length+class_token_length): weight = paddle.einsum( - "bct,bcs->bts", q * self.scale, - k * self.scale) # More stable with f16 than dividing afterwards - weight = nn.functional.softmax( - weight.cast("float32"), axis=-1).cast(weight.dtype) + "bct,bcs->bts", q * self.scale, k * self.scale + ) # More stable with f16 than dividing afterwards + weight = nn.functional.softmax(weight.cast("float32"), axis=-1).cast(weight.dtype) # (bs*n_heads, dim_per_head, class_token_length) a = paddle.einsum("bts,bcs->bct", weight, v) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py b/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py index 192173d39afdf..d3a3befd29063 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_aemodules3d.py @@ -39,8 +39,9 @@ def hinge_d_loss(logits_real, logits_fake): def vanilla_d_loss(logits_real, 
logits_fake): d_loss = 0.5 * ( - paddle.mean(x=paddle.nn.functional.softplus(x=-logits_real)) + - paddle.mean(x=paddle.nn.functional.softplus(x=logits_fake))) + paddle.mean(x=paddle.nn.functional.softplus(x=-logits_real)) + + paddle.mean(x=paddle.nn.functional.softplus(x=logits_fake)) + ) return d_loss @@ -52,41 +53,34 @@ def Normalize(in_channels, norm_type="group"): num_channels=in_channels, epsilon=1e-06, weight_attr=None, - bias_attr=None, ) + bias_attr=None, + ) elif norm_type == "batch": return paddle.nn.SyncBatchNorm(in_channels) class ResBlock(paddle.nn.Layer): def __init__( - self, - in_channels, - out_channels=None, - conv_shortcut=False, - dropout=0.0, - norm_type="group", - padding_type="replicate", ): + self, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout=0.0, + norm_type="group", + padding_type="replicate", + ): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels self.out_channels = out_channels self.use_conv_shortcut = conv_shortcut self.norm1 = Normalize(in_channels, norm_type) - self.conv1 = SamePadConv3d( - in_channels, out_channels, kernel_size=3, padding_type=padding_type) + self.conv1 = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type) self.dropout = paddle.nn.Dropout(p=dropout) self.norm2 = Normalize(in_channels, norm_type) - self.conv2 = SamePadConv3d( - out_channels, - out_channels, - kernel_size=3, - padding_type=padding_type) + self.conv2 = SamePadConv3d(out_channels, out_channels, kernel_size=3, padding_type=padding_type) if self.in_channels != self.out_channels: - self.conv_shortcut = SamePadConv3d( - in_channels, - out_channels, - kernel_size=3, - padding_type=padding_type) + self.conv_shortcut = SamePadConv3d(in_channels, out_channels, kernel_size=3, padding_type=padding_type) def forward(self, x): h = x @@ -103,18 +97,19 @@ def forward(self, x): class SamePadConv3d(paddle.nn.Layer): def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - bias=True, - padding_type="replicate", ): + self, + in_channels, + out_channels, + kernel_size, + stride=1, + bias=True, + padding_type="replicate", + ): super().__init__() if isinstance(kernel_size, int): - kernel_size = (kernel_size, ) * 3 + kernel_size = (kernel_size,) * 3 if isinstance(stride, int): - stride = (stride, ) * 3 + stride = (stride,) * 3 total_pad = tuple([(k - s) for k, s in zip(kernel_size, stride)]) pad_input = [] for p in total_pad[::-1]: @@ -128,31 +123,31 @@ def __init__( kernel_size=kernel_size, stride=stride, padding=0, - bias_attr=bias, ) + bias_attr=bias, + ) self.weight = self.conv.weight def forward(self, x): return self.conv( - paddle.nn.functional.pad(x=x, - pad=self.pad_input, - mode=self.padding_type, - data_format="NCDHW")) + paddle.nn.functional.pad(x=x, pad=self.pad_input, mode=self.padding_type, data_format="NCDHW") + ) class SamePadConvTranspose3d(paddle.nn.Layer): def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - bias=True, - padding_type="replicate", ): + self, + in_channels, + out_channels, + kernel_size, + stride=1, + bias=True, + padding_type="replicate", + ): super().__init__() if isinstance(kernel_size, int): - kernel_size = (kernel_size, ) * 3 + kernel_size = (kernel_size,) * 3 if isinstance(stride, int): - stride = (stride, ) * 3 + stride = (stride,) * 3 total_pad = tuple([(k - s) for k, s in zip(kernel_size, stride)]) pad_input = [] for p in total_pad[::-1]: @@ -166,45 +161,38 @@ def __init__( 
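SamePadConv3d pads each dimension by kernel_size - stride in total, so that for inputs divisible by the stride the output length is exactly input_length / stride; the total is split with the larger half first and listed starting from the last dimension, matching the reversed loop above. A short sketch of that padding arithmetic, assuming the usual VideoGPT-style split (the exact append statement is outside the hunk):

    def same_pad_amounts(kernel_size, stride):
        # Total padding of (k - s) per dim keeps out_len == in_len // stride
        # whenever in_len is divisible by stride (and k >= s).
        total_pad = [k - s for k, s in zip(kernel_size, stride)]
        pad_input = []
        for p in reversed(total_pad):          # pad amounts are listed last-dim first
            pad_input.extend([p // 2 + p % 2, p // 2])
        return pad_input

    print(same_pad_amounts((3, 3, 3), (1, 1, 1)))  # [1, 1, 1, 1, 1, 1]
    print(same_pad_amounts((4, 4, 4), (2, 2, 2)))  # [1, 1, 1, 1, 1, 1]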
kernel_size=kernel_size, stride=stride, padding=tuple([(k - 1) for k in kernel_size]), - bias_attr=bias, ) + bias_attr=bias, + ) def forward(self, x): return self.convt( - paddle.nn.functional.pad(x=x, - pad=self.pad_input, - mode=self.padding_type, - data_format="NCDHW")) + paddle.nn.functional.pad(x=x, pad=self.pad_input, mode=self.padding_type, data_format="NCDHW") + ) class Encoder(paddle.nn.Layer): def __init__( - self, - n_hiddens, - downsample, - z_channels, - double_z, - image_channel=3, - norm_type="group", - padding_type="replicate", ): + self, + n_hiddens, + downsample, + z_channels, + double_z, + image_channel=3, + norm_type="group", + padding_type="replicate", + ): super().__init__() n_times_downsample = np.array([int(math.log2(d)) for d in downsample]) self.conv_blocks = paddle.nn.LayerList() max_ds = n_times_downsample.max() - self.conv_first = SamePadConv3d( - image_channel, n_hiddens, kernel_size=3, padding_type=padding_type) + self.conv_first = SamePadConv3d(image_channel, n_hiddens, kernel_size=3, padding_type=padding_type) for i in range(max_ds): block = paddle.nn.Layer() in_channels = n_hiddens * 2**i - out_channels = n_hiddens * 2**(i + 1) + out_channels = n_hiddens * 2 ** (i + 1) stride = tuple([(2 if d > 0 else 1) for d in n_times_downsample]) - block.down = SamePadConv3d( - in_channels, - out_channels, - 4, - stride=stride, - padding_type=padding_type) - block.res = ResBlock( - out_channels, out_channels, norm_type=norm_type) + block.down = SamePadConv3d(in_channels, out_channels, 4, stride=stride, padding_type=padding_type) + block.res = ResBlock(out_channels, out_channels, norm_type=norm_type) self.conv_blocks.append(block) n_times_downsample -= 1 self.final_block = paddle.nn.Sequential( @@ -215,7 +203,9 @@ def __init__( 2 * z_channels if double_z else z_channels, kernel_size=3, stride=1, - padding_type=padding_type, ), ) + padding_type=padding_type, + ), + ) self.out_channels = out_channels def forward(self, x): @@ -228,12 +218,7 @@ def forward(self, x): class Decoder(paddle.nn.Layer): - def __init__(self, - n_hiddens, - upsample, - z_channels, - image_channel, - norm_type="group"): + def __init__(self, n_hiddens, upsample, z_channels, image_channel, norm_type="group"): super().__init__() n_times_upsample = np.array([int(math.log2(d)) for d in upsample]) max_us = n_times_upsample.max() @@ -241,20 +226,15 @@ def __init__(self, self.conv_blocks = paddle.nn.LayerList() for i in range(max_us): block = paddle.nn.Layer() - in_channels = in_channels if i == 0 else n_hiddens * 2**( - max_us - i + 1) - out_channels = n_hiddens * 2**(max_us - i) + in_channels = in_channels if i == 0 else n_hiddens * 2 ** (max_us - i + 1) + out_channels = n_hiddens * 2 ** (max_us - i) us = tuple([(2 if d > 0 else 1) for d in n_times_upsample]) - block.up = SamePadConvTranspose3d( - in_channels, out_channels, 4, stride=us) - block.res1 = ResBlock( - out_channels, out_channels, norm_type=norm_type) - block.res2 = ResBlock( - out_channels, out_channels, norm_type=norm_type) + block.up = SamePadConvTranspose3d(in_channels, out_channels, 4, stride=us) + block.res1 = ResBlock(out_channels, out_channels, norm_type=norm_type) + block.res2 = ResBlock(out_channels, out_channels, norm_type=norm_type) self.conv_blocks.append(block) n_times_upsample -= 1 - self.conv_out = SamePadConv3d( - out_channels, image_channel, kernel_size=3) + self.conv_out = SamePadConv3d(out_channels, image_channel, kernel_size=3) def forward(self, x): h = x diff --git 
a/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py b/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py index 7a934de7f6224..acc73c41c8fdd 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_attention_temporal.py @@ -17,8 +17,9 @@ from paddle.distributed.fleet.utils import recompute try: - from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention # noqa + from paddle.incubate.nn.memory_efficient_attention import ( # noqa + memory_efficient_attention, + ) _ppxformers_available = True except: @@ -30,8 +31,15 @@ from einops import rearrange, repeat from ..utils.initializer_utils import constant_, xavier_uniform_ -from .lvdm_util import (GEGLU, Normalize, conv_nd, default, exists, - normalization, zero_module) +from .lvdm_util import ( + GEGLU, + Normalize, + conv_nd, + default, + exists, + normalization, + zero_module, +) def finfo(dtype): @@ -53,15 +61,19 @@ def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.0): super().__init__() inner_dim = int(dim * mult) dim_out = default(dim_out, dim) - project_in = (paddle.nn.Sequential( - paddle.nn.Linear( - in_features=dim, out_features=inner_dim), - paddle.nn.GELU(), ) if not glu else GEGLU(dim, inner_dim)) + project_in = ( + paddle.nn.Sequential( + paddle.nn.Linear(in_features=dim, out_features=inner_dim), + paddle.nn.GELU(), + ) + if not glu + else GEGLU(dim, inner_dim) + ) self.net = paddle.nn.Sequential( project_in, paddle.nn.Dropout(p=dropout), - paddle.nn.Linear( - in_features=inner_dim, out_features=dim_out), ) + paddle.nn.Linear(in_features=inner_dim, out_features=dim_out), + ) def forward(self, x): return self.net(x) @@ -74,19 +86,19 @@ def __init__(self, num_units, max_relative_position): super().__init__() self.num_units = num_units self.max_relative_position = max_relative_position - self.embeddings_table = paddle.nn.Parameter( - paddle.empty(shape=[max_relative_position * 2 + 1, num_units])) + self.embeddings_table = paddle.nn.Parameter(paddle.empty(shape=[max_relative_position * 2 + 1, num_units])) xavier_uniform_(self.embeddings_table) def forward(self, length_q, length_k): - device = self.embeddings_table.place + # device = self.embeddings_table.place range_vec_q = paddle.arange(end=length_q) range_vec_k = paddle.arange(end=length_k) distance_mat = range_vec_k[(None), :] - range_vec_q[:, (None)] distance_mat_clipped = paddle.clip( x=distance_mat, min=-self.max_relative_position, - max=self.max_relative_position, ) + max=self.max_relative_position, + ) final_mat = distance_mat_clipped + self.max_relative_position final_mat = final_mat.astype(dtype="int64") embeddings = self.embeddings_table[final_mat] @@ -95,15 +107,16 @@ def forward(self, length_q, length_k): class TemporalCrossAttention(paddle.nn.Layer): def __init__( - self, - query_dim, - context_dim=None, - heads=8, - dim_head=64, - dropout=0.0, - use_relative_position=False, - temporal_length=None, - **kwargs, ): + self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.0, + use_relative_position=False, + temporal_length=None, + **kwargs, + ): super().__init__() inner_dim = dim_head * heads context_dim = default(context_dim, query_dim) @@ -112,22 +125,17 @@ def __init__( self.heads = heads self.temporal_length = temporal_length self.use_relative_position = use_relative_position - self.to_q = paddle.nn.Linear( - in_features=query_dim, out_features=inner_dim, bias_attr=False) - self.to_k = paddle.nn.Linear( - in_features=context_dim, 
out_features=inner_dim, bias_attr=False) - self.to_v = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False) + self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) self.to_out = paddle.nn.Sequential( - paddle.nn.Linear( - in_features=inner_dim, out_features=query_dim), - paddle.nn.Dropout(p=dropout), ) + paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), + paddle.nn.Dropout(p=dropout), + ) if use_relative_position: assert temporal_length is not None - self.relative_position_k = RelativePosition( - num_units=dim_head, max_relative_position=temporal_length) - self.relative_position_v = RelativePosition( - num_units=dim_head, max_relative_position=temporal_length) + self.relative_position_k = RelativePosition(num_units=dim_head, max_relative_position=temporal_length) + self.relative_position_v = RelativePosition(num_units=dim_head, max_relative_position=temporal_length) constant_(self.to_q.weight, 0) constant_(self.to_k.weight, 0) constant_(self.to_v.weight, 0) @@ -162,32 +170,23 @@ def forward(self, x, context=None, mask=None): class CrossAttention(paddle.nn.Layer): - def __init__(self, - query_dim, - context_dim=None, - heads=8, - dim_head=64, - dropout=0.0, - **kwargs): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs): super().__init__() inner_dim = dim_head * heads context_dim = default(context_dim, query_dim) self.scale = dim_head**-0.5 self.heads = heads - self.to_q = paddle.nn.Linear( - in_features=query_dim, out_features=inner_dim, bias_attr=False) - self.to_k = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) - self.to_v = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False) + self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) self.to_out = paddle.nn.Sequential( - paddle.nn.Linear( - in_features=inner_dim, out_features=query_dim), - paddle.nn.Dropout(p=dropout), ) + paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), + paddle.nn.Dropout(p=dropout), + ) def forward(self, x, context=None, mask=None): h = self.heads - b = x.shape[0] + # b = x.shape[0] q = self.to_q(x) context = default(context, x) k = self.to_k(context) @@ -206,13 +205,7 @@ def forward(self, x, context=None, mask=None): class MemoryEfficientCrossAttention(paddle.nn.Layer): - def __init__(self, - query_dim, - context_dim=None, - heads=8, - dim_head=64, - dropout=0.0, - **kwargs): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, **kwargs): super().__init__() print( f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using {heads} heads." 
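CrossAttention above is plain multi-head scaled-dot-product attention: x is projected to queries, the (optional) context to keys and values, heads are split out, and softmax(q @ k^T * scale) @ v is projected back to query_dim. A minimal NumPy sketch of one forward pass, without masking or dropout (the weights and shapes here are illustrative):

    import numpy as np

    def softmax(x, axis=-1):
        x = x - x.max(axis=axis, keepdims=True)
        e = np.exp(x)
        return e / e.sum(axis=axis, keepdims=True)

    def cross_attention(x, context, wq, wk, wv, wo, heads):
        b, n, _ = x.shape
        m = context.shape[1]
        d = wq.shape[1] // heads                                   # dim per head
        q = (x @ wq).reshape(b, n, heads, d).transpose(0, 2, 1, 3)
        k = (context @ wk).reshape(b, m, heads, d).transpose(0, 2, 1, 3)
        v = (context @ wv).reshape(b, m, heads, d).transpose(0, 2, 1, 3)
        attn = softmax(q @ k.transpose(0, 1, 3, 2) * d ** -0.5)    # (b, heads, n, m)
        out = (attn @ v).transpose(0, 2, 1, 3).reshape(b, n, heads * d)
        return out @ wo

    rng = np.random.default_rng(0)
    x, ctx = rng.standard_normal((2, 16, 64)), rng.standard_normal((2, 77, 96))
    wq, wk, wv, wo = (rng.standard_normal(s) * 0.02 for s in [(64, 128), (96, 128), (96, 128), (128, 64)])
    print(cross_attention(x, ctx, wq, wk, wv, wo, heads=8).shape)  # (2, 16, 64)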
@@ -221,16 +214,13 @@ def __init__(self, context_dim = default(context_dim, query_dim) self.heads = heads self.dim_head = dim_head - self.to_q = paddle.nn.Linear( - in_features=query_dim, out_features=inner_dim, bias_attr=False) - self.to_k = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) - self.to_v = paddle.nn.Linear( - in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_q = paddle.nn.Linear(in_features=query_dim, out_features=inner_dim, bias_attr=False) + self.to_k = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) + self.to_v = paddle.nn.Linear(in_features=context_dim, out_features=inner_dim, bias_attr=False) self.to_out = paddle.nn.Sequential( - paddle.nn.Linear( - in_features=inner_dim, out_features=query_dim), - paddle.nn.Dropout(p=dropout), ) + paddle.nn.Linear(in_features=inner_dim, out_features=query_dim), + paddle.nn.Dropout(p=dropout), + ) self.attention_op = "cutlass" def forward(self, x, context=None, mask=None): @@ -239,8 +229,7 @@ def forward(self, x, context=None, mask=None): k = self.to_k(context) v = self.to_v(context) b, _, _ = q.shape - q, k, v = map(lambda t: t.reshape([0, 0, self.heads, self.dim_head]), - (q, k, v)) + q, k, v = map(lambda t: t.reshape([0, 0, self.heads, self.dim_head]), (q, k, v)) out = F.scaled_dot_product_attention_( q, k, @@ -248,7 +237,8 @@ def forward(self, x, context=None, mask=None): attn_mask=None, dropout_p=0.0, attention_op=self.attention_op, - training=True, ) + training=True, + ) if exists(mask): raise NotImplementedError out = out.reshape([0, 0, self.heads * self.dim_head]) @@ -261,63 +251,46 @@ class BasicTransformerBlockST(paddle.nn.Layer): """ def __init__( - self, - dim, - n_heads, - d_head, - dropout=0.0, - context_dim=None, - gated_ff=True, - checkpoint=True, - temporal_length=None, - use_relative_position=True, - **kwargs, ): + self, + dim, + n_heads, + d_head, + dropout=0.0, + context_dim=None, + gated_ff=True, + checkpoint=True, + temporal_length=None, + use_relative_position=True, + **kwargs, + ): super().__init__() if _ppxformers_available: self.attn1 = MemoryEfficientCrossAttention( - query_dim=dim, - heads=n_heads, - dim_head=d_head, - dropout=dropout, - **kwargs) + query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs + ) self.attn2 = MemoryEfficientCrossAttention( query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, - **kwargs, ) + **kwargs, + ) else: - self.attn1 = CrossAttention( - query_dim=dim, - heads=n_heads, - dim_head=d_head, - dropout=dropout, - **kwargs) + self.attn1 = CrossAttention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout, **kwargs) self.attn2 = CrossAttention( query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head, dropout=dropout, - **kwargs, ) + **kwargs, + ) self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) - self.norm1 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) - self.norm2 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) - self.norm3 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) + self.norm1 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) + self.norm2 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) + self.norm3 = paddle.nn.LayerNorm(normalized_shape=dim, 
epsilon=1e-05, weight_attr=None, bias_attr=None) self.checkpoint = checkpoint self.attn1_tmp = TemporalCrossAttention( query_dim=dim, @@ -326,7 +299,8 @@ def __init__( dropout=dropout, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs, ) + **kwargs, + ) self.attn2_tmp = TemporalCrossAttention( query_dim=dim, heads=n_heads, @@ -335,17 +309,10 @@ def __init__( context_dim=None, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs, ) - self.norm4 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) - self.norm5 = paddle.nn.LayerNorm( - normalized_shape=dim, - epsilon=1e-05, - weight_attr=None, - bias_attr=None) + **kwargs, + ) + self.norm4 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) + self.norm5 = paddle.nn.LayerNorm(normalized_shape=dim, epsilon=1e-05, weight_attr=None, bias_attr=None) def forward(self, x, context=None, **kwargs): if self.checkpoint: @@ -366,8 +333,7 @@ def _forward(self, x, context=None, mask=None): if context is not None: context_ = [] for i in range(context.shape[0]): - context_.append(context[i].unsqueeze(axis=0).tile( - repeat_times=[t, 1, 1])) + context_.append(context[i].unsqueeze(axis=0).tile(repeat_times=[t, 1, 1])) context_ = paddle.concat(x=context_, axis=0) else: context_ = None @@ -389,16 +355,17 @@ class SpatialTemporalTransformer(paddle.nn.Layer): """ def __init__( - self, - in_channels, - n_heads, - d_head, - depth=1, - dropout=0.0, - context_dim=None, - temporal_length=None, - use_relative_position=True, - **kwargs, ): + self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0.0, + context_dim=None, + temporal_length=None, + use_relative_position=True, + **kwargs, + ): super().__init__() self.in_channels = in_channels inner_dim = n_heads * d_head @@ -408,25 +375,32 @@ def __init__( out_channels=inner_dim, kernel_size=1, stride=1, - padding=0, ) - self.transformer_blocks = paddle.nn.LayerList(sublayers=[ - BasicTransformerBlockST( - inner_dim, - n_heads, - d_head, - dropout=dropout, - context_dim=context_dim, - temporal_length=temporal_length, - use_relative_position=use_relative_position, - **kwargs, ) for d in range(depth) - ]) + padding=0, + ) + self.transformer_blocks = paddle.nn.LayerList( + sublayers=[ + BasicTransformerBlockST( + inner_dim, + n_heads, + d_head, + dropout=dropout, + context_dim=context_dim, + temporal_length=temporal_length, + use_relative_position=use_relative_position, + **kwargs, + ) + for d in range(depth) + ] + ) self.proj_out = zero_module( paddle.nn.Conv3D( in_channels=inner_dim, out_channels=in_channels, kernel_size=1, stride=1, - padding=0, )) + padding=0, + ) + ) def forward(self, x, context=None, **kwargs): assert x.dim() == 5, f"x shape = {x.shape}" @@ -441,13 +415,14 @@ def forward(self, x, context=None, **kwargs): class STAttentionBlock(paddle.nn.Layer): def __init__( - self, - channels, - num_heads=1, - num_head_channels=-1, - use_checkpoint=False, - temporal_length=16, - use_relative_position=False, ): + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + temporal_length=16, + use_relative_position=False, + ): super().__init__() if num_head_channels == -1: self.num_heads = num_heads @@ -468,10 +443,12 @@ def __init__( if use_relative_position: self.relative_position_k = RelativePosition( num_units=channels // self.num_heads, - max_relative_position=temporal_length, ) + max_relative_position=temporal_length, + ) 
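The RelativePosition modules instantiated here (defined earlier in this file) turn pairwise frame distances into learned bias embeddings: distances are clipped to [-max_relative_position, max_relative_position], shifted to be non-negative, and used to index a table of 2 * max_relative_position + 1 rows. The index computation, as a small NumPy sketch:

    import numpy as np

    def relative_position_lookup(length_q, length_k, max_rel, table):
        # table: (2 * max_rel + 1, num_units) learned embedding matrix
        dist = np.arange(length_k)[None, :] - np.arange(length_q)[:, None]   # (Lq, Lk)
        idx = np.clip(dist, -max_rel, max_rel) + max_rel                     # in [0, 2 * max_rel]
        return table[idx]                                                    # (Lq, Lk, num_units)

    table = np.random.default_rng(0).standard_normal((2 * 4 + 1, 8))
    print(relative_position_lookup(5, 5, max_rel=4, table=table).shape)      # (5, 5, 8)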
self.relative_position_v = RelativePosition( num_units=channels // self.num_heads, - max_relative_position=temporal_length, ) + max_relative_position=temporal_length, + ) self.proj_out_s = zero_module(conv_nd(1, channels, channels, 1)) self.proj_out_t = zero_module(conv_nd(1, channels, channels, 1)) @@ -512,22 +489,21 @@ def forward(self, qkv, rp=None, mask=None): weight = paddle.einsum( "bct,bcs->bts", (q * scale).reshape([bs * self.n_heads, ch, length]), - (k * scale).reshape([bs * self.n_heads, ch, length]), ) + (k * scale).reshape([bs * self.n_heads, ch, length]), + ) if rp is not None: k_rp, v_rp = rp weight2 = paddle.einsum( "bct,tsc->bst", (q * scale).reshape([bs * self.n_heads, ch, length]), - k_rp, ) + k_rp, + ) weight += weight2 if mask is not None: INF = -100000000.0 - weight = paddle.where( - mask == 0, weight.astype(dtype="float32"), INF) - weight = paddle.nn.functional.softmax( - x=weight.astype(dtype="float32"), axis=-1).astype(weight.dtype) - a = paddle.einsum("bts,bcs->bct", weight, - v.reshape([bs * self.n_heads, ch, length])) + weight = paddle.where(mask == 0, weight.astype(dtype="float32"), INF) + weight = paddle.nn.functional.softmax(x=weight.astype(dtype="float32"), axis=-1).astype(weight.dtype) + a = paddle.einsum("bts,bcs->bct", weight, v.reshape([bs * self.n_heads, ch, length])) if rp is not None: x = paddle.einsum("bts,tsc->btc", weight, v_rp) perm_3 = list(range(x.ndim)) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_distributions.py b/ppdiffusers/ppdiffusers/models/lvdm_distributions.py index e2b9a88f4c4e5..a66cf086f85d7 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_distributions.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_distributions.py @@ -58,21 +58,26 @@ def kl(self, other=None): elif other is None: return 0.5 * paddle.sum( x=paddle.pow(x=self.mean, y=2) + self.var - 1.0 - self.logvar, - axis=[1, 2, 3], ) + axis=[1, 2, 3], + ) else: return 0.5 * paddle.sum( - x=paddle.pow(x=self.mean - other.mean, y=2) / other.var + - self.var / other.var - 1.0 - self.logvar + other.logvar, - axis=[1, 2, 3], ) + x=paddle.pow(x=self.mean - other.mean, y=2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + axis=[1, 2, 3], + ) def nll(self, sample, dims=[1, 2, 3]): if self.deterministic: return paddle.to_tensor(data=[0.0], dtype="float32") logtwopi = np.log(2.0 * np.pi) return 0.5 * paddle.sum( - x=logtwopi + self.logvar + paddle.pow(x=sample - self.mean, y=2) / - self.var, - axis=dims, ) + x=logtwopi + self.logvar + paddle.pow(x=sample - self.mean, y=2) / self.var, + axis=dims, + ) def mode(self): return self.mean @@ -91,11 +96,11 @@ def normal_kl(mean1, logvar1, mean2, logvar2): tensor = obj break assert tensor is not None, "at least one argument must be a Tensor" - logvar1, logvar2 = [ - (x if isinstance(x, paddle.Tensor) else paddle.to_tensor(data=x)) - for x in (logvar1, logvar2) - ] + logvar1, logvar2 = [(x if isinstance(x, paddle.Tensor) else paddle.to_tensor(data=x)) for x in (logvar1, logvar2)] return 0.5 * ( - -1.0 + logvar2 - logvar1 + paddle.exp(x=(logvar1 - logvar2 - ).astype("float32")) + - (mean1 - mean2)**2 * paddle.exp(x=(-logvar2).astype("float32"))) + -1.0 + + logvar2 + - logvar1 + + paddle.exp(x=(logvar1 - logvar2).astype("float32")) + + (mean1 - mean2) ** 2 * paddle.exp(x=(-logvar2).astype("float32")) + ) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py b/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py index a48a260f655dd..512431be11300 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py +++ 
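The kl, nll, and normal_kl helpers in lvdm_distributions.py above implement the closed-form KL divergence between diagonal Gaussians, KL(N(m1, v1) || N(m2, v2)) = 0.5 * sum((m1 - m2)^2 / v2 + v1 / v2 - 1 - log v1 + log v2), which with m2 = 0 and v2 = 1 reduces to the KL against a standard normal prior. A quick NumPy check of the formula:

    import numpy as np

    def diag_gaussian_kl(mean1, logvar1, mean2=0.0, logvar2=0.0):
        # KL( N(mean1, exp(logvar1)) || N(mean2, exp(logvar2)) ), summed over elements
        var1, var2 = np.exp(logvar1), np.exp(logvar2)
        return 0.5 * np.sum((mean1 - mean2) ** 2 / var2 + var1 / var2 - 1.0 - logvar1 + logvar2)

    mu, logvar = np.array([0.3, -0.1]), np.array([0.2, -0.5])
    print(diag_gaussian_kl(mu, logvar))              # KL against the standard normal prior
    print(diag_gaussian_kl(mu, logvar, mu, logvar))  # 0.0: KL of a distribution with itself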
b/ppdiffusers/ppdiffusers/models/lvdm_unet_3d.py @@ -21,10 +21,16 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput -from .lvdm_attention_temporal import (SpatialTemporalTransformer, - STAttentionBlock) -from .lvdm_util import (avg_pool_nd, conv_nd, linear, nonlinearity, - normalization, timestep_embedding, zero_module) +from .lvdm_attention_temporal import SpatialTemporalTransformer, STAttentionBlock +from .lvdm_util import ( + avg_pool_nd, + conv_nd, + linear, + nonlinearity, + normalization, + timestep_embedding, + zero_module, +) from .modeling_utils import ModelMixin @@ -87,13 +93,14 @@ class Upsample(paddle.nn.Layer): """ def __init__( - self, - channels, - use_conv, - dims=2, - out_channels=None, - kernel_size_t=3, - padding_t=1, ): + self, + channels, + use_conv, + dims=2, + out_channels=None, + kernel_size_t=3, + padding_t=1, + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -105,7 +112,8 @@ def __init__( self.channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), ) + padding=(padding_t, 1, 1), + ) def forward(self, x): assert x.shape[1] == self.channels @@ -114,10 +122,10 @@ def forward(self, x): x=x, size=(x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest", - data_format="NCDHW", ) + data_format="NCDHW", + ) else: - x = paddle.nn.functional.interpolate( - x=x, scale_factor=2, mode="nearest") + x = paddle.nn.functional.interpolate(x=x, scale_factor=2, mode="nearest") if self.use_conv: x = self.conv(x) return x @@ -133,13 +141,14 @@ class Downsample(paddle.nn.Layer): """ def __init__( - self, - channels, - use_conv, - dims=2, - out_channels=None, - kernel_size_t=3, - padding_t=1, ): + self, + channels, + use_conv, + dims=2, + out_channels=None, + kernel_size_t=3, + padding_t=1, + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -153,7 +162,8 @@ def __init__( self.out_channels, (kernel_size_t, 3, 3), stride=stride, - padding=(padding_t, 1, 1), ) + padding=(padding_t, 1, 1), + ) else: assert self.channels == self.out_channels self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) @@ -179,21 +189,23 @@ class ResBlock(TimestepBlock): :param down: if True, use this block for downsampling. 
""" - def __init__(self, - channels, - emb_channels, - dropout, - out_channels=None, - use_conv=False, - use_scale_shift_norm=False, - dims=2, - use_checkpoint=False, - up=False, - down=False, - kernel_size_t=3, - padding_t=1, - nonlinearity_type="silu", - **kwargs): + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + kernel_size_t=3, + padding_t=1, + nonlinearity_type="silu", + **kwargs + ): super().__init__() self.channels = channels self.emb_channels = emb_channels @@ -211,42 +223,25 @@ def __init__(self, channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), ), ) + padding=(padding_t, 1, 1), + ), + ) self.updown = up or down if up: - self.h_upd = Upsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) - self.x_upd = Upsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) + self.h_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) + self.x_upd = Upsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) elif down: - self.h_upd = Downsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) - self.x_upd = Downsample( - channels, - False, - dims, - kernel_size_t=kernel_size_t, - padding_t=padding_t) + self.h_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) + self.x_upd = Downsample(channels, False, dims, kernel_size_t=kernel_size_t, padding_t=padding_t) else: self.h_upd = self.x_upd = paddle.nn.Identity() self.emb_layers = paddle.nn.Sequential( nonlinearity(nonlinearity_type), linear( emb_channels, - 2 * self.out_channels - if use_scale_shift_norm else self.out_channels, ), ) + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) self.out_layers = paddle.nn.Sequential( normalization(self.out_channels), nonlinearity(nonlinearity_type), @@ -257,7 +252,10 @@ def __init__(self, self.out_channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), )), ) + padding=(padding_t, 1, 1), + ) + ), + ) if self.out_channels == channels: self.skip_connection = paddle.nn.Identity() elif use_conv: @@ -266,7 +264,8 @@ def __init__(self, channels, self.out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), ) + padding=(padding_t, 1, 1), + ) else: self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) @@ -317,11 +316,9 @@ def _forward(self, x, emb): # return STTransformerClass -def make_spatialtemporal_transformer(module_name="attention_temporal", - class_name="SpatialTemporalTransformer"): +def make_spatialtemporal_transformer(module_name="attention_temporal", class_name="SpatialTemporalTransformer"): # Todo: Support loading more types of transformers - assert (module_name == "attention_temporal" and - class_name == "SpatialTemporalTransformer") + assert module_name == "attention_temporal" and class_name == "SpatialTemporalTransformer" return SpatialTemporalTransformer @@ -354,37 +351,39 @@ class LVDMUNet3DModel(ModelMixin, ConfigMixin): """ @register_to_config - def __init__(self, - image_size, - in_channels, - model_channels, - out_channels, - num_res_blocks, - attention_resolutions, - dropout=0, - channel_mult=(1, 2, 4, 8), - conv_resample=True, - dims=3, - num_classes=None, - use_checkpoint=False, - use_fp16=False, - num_heads=-1, - num_head_channels=-1, - 
num_heads_upsample=-1, - use_scale_shift_norm=False, - resblock_updown=False, - transformer_depth=1, - context_dim=None, - legacy=True, - kernel_size_t=1, - padding_t=1, - use_temporal_transformer=False, - temporal_length=None, - use_relative_position=False, - nonlinearity_type="silu", - ST_transformer_module="attention_temporal", - ST_transformer_class="SpatialTemporalTransformer", - **kwargs): + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=3, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + transformer_depth=1, + context_dim=None, + legacy=True, + kernel_size_t=1, + padding_t=1, + use_temporal_transformer=False, + temporal_length=None, + use_relative_position=False, + nonlinearity_type="silu", + ST_transformer_module="attention_temporal", + ST_transformer_class="SpatialTemporalTransformer", + **kwargs + ): super().__init__() if use_temporal_transformer: assert ( @@ -401,11 +400,9 @@ def __init__(self, if num_heads_upsample == -1: num_heads_upsample = num_heads if num_heads == -1: - assert (num_head_channels != -1 - ), "Either num_heads or num_head_channels has to be set" + assert num_head_channels != -1, "Either num_heads or num_head_channels has to be set" if num_head_channels == -1: - assert (num_heads != -1 - ), "Either num_heads or num_head_channels has to be set" + assert num_heads != -1, "Either num_heads or num_head_channels has to be set" self.image_size = image_size self.in_channels = in_channels self.model_channels = model_channels @@ -430,20 +427,26 @@ def __init__(self, self.time_embed = paddle.nn.Sequential( linear(model_channels, time_embed_dim), nonlinearity(nonlinearity_type), - linear(time_embed_dim, time_embed_dim), ) + linear(time_embed_dim, time_embed_dim), + ) if self.num_classes is not None: self.label_emb = paddle.nn.Embedding(num_classes, time_embed_dim) STTransformerClass = make_spatialtemporal_transformer( - module_name=ST_transformer_module, class_name=ST_transformer_class) - self.input_blocks = paddle.nn.LayerList(sublayers=[ - TimestepEmbedSequential( - conv_nd( - dims, - in_channels, - model_channels, - (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), )) - ]) + module_name=ST_transformer_module, class_name=ST_transformer_class + ) + self.input_blocks = paddle.nn.LayerList( + sublayers=[ + TimestepEmbedSequential( + conv_nd( + dims, + in_channels, + model_channels, + (kernel_size_t, 3, 3), + padding=(padding_t, 1, 1), + ) + ) + ] + ) self._feature_size = model_channels input_block_chans = [model_channels] ch = model_channels @@ -462,7 +465,8 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs) + **kwargs, + ) ] ch = mult * model_channels if ds in attention_resolutions: @@ -472,8 +476,7 @@ def __init__(self, num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: - dim_head = (ch // num_heads if use_temporal_transformer - else num_head_channels) + dim_head = ch // num_heads if use_temporal_transformer else num_head_channels layers.append( STAttentionBlock( ch, @@ -481,8 +484,10 @@ def __init__(self, num_heads=num_heads, num_head_channels=dim_head, temporal_length=temporal_length, - use_relative_position=use_relative_position, ) - if not use_temporal_transformer else STTransformerClass( + 
use_relative_position=use_relative_position, + ) + if not use_temporal_transformer + else STTransformerClass( ch, num_heads, dim_head, @@ -490,7 +495,9 @@ def __init__(self, context_dim=context_dim, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs)) + **kwargs, + ) + ) self.input_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch input_block_chans.append(ch) @@ -510,13 +517,19 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs) if resblock_updown else Downsample( - ch, - conv_resample, - dims=dims, - out_channels=out_ch, - kernel_size_t=kernel_size_t, - padding_t=padding_t, ))) + **kwargs, + ) + if resblock_updown + else Downsample( + ch, + conv_resample, + dims=dims, + out_channels=out_ch, + kernel_size_t=kernel_size_t, + padding_t=padding_t, + ) + ) + ) ch = out_ch input_block_chans.append(ch) ds *= 2 @@ -527,8 +540,7 @@ def __init__(self, num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: - dim_head = (ch // num_heads - if use_temporal_transformer else num_head_channels) + dim_head = ch // num_heads if use_temporal_transformer else num_head_channels self.middle_block = TimestepEmbedSequential( ResBlock( ch, @@ -540,15 +552,18 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs), + **kwargs, + ), STAttentionBlock( ch, use_checkpoint=use_checkpoint, num_heads=num_heads, num_head_channels=dim_head, temporal_length=temporal_length, - use_relative_position=use_relative_position, ) - if not use_temporal_transformer else STTransformerClass( + use_relative_position=use_relative_position, + ) + if not use_temporal_transformer + else STTransformerClass( ch, num_heads, dim_head, @@ -556,7 +571,8 @@ def __init__(self, context_dim=context_dim, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs), + **kwargs, + ), ResBlock( ch, time_embed_dim, @@ -567,7 +583,9 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs), ) + **kwargs, + ), + ) self._feature_size += ch self.output_blocks = paddle.nn.LayerList(sublayers=[]) for level, mult in list(enumerate(channel_mult))[::-1]: @@ -585,7 +603,8 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, nonlinearity_type=nonlinearity_type, - **kwargs) + **kwargs, + ) ] ch = model_channels * mult if ds in attention_resolutions: @@ -595,8 +614,7 @@ def __init__(self, num_heads = ch // num_head_channels dim_head = num_head_channels if legacy: - dim_head = (ch // num_heads if use_temporal_transformer - else num_head_channels) + dim_head = ch // num_heads if use_temporal_transformer else num_head_channels layers.append( STAttentionBlock( ch, @@ -604,8 +622,10 @@ def __init__(self, num_heads=num_heads, num_head_channels=dim_head, temporal_length=temporal_length, - use_relative_position=use_relative_position, ) - if not use_temporal_transformer else STTransformerClass( + use_relative_position=use_relative_position, + ) + if not use_temporal_transformer + else STTransformerClass( ch, num_heads, dim_head, @@ -613,7 +633,9 @@ def __init__(self, context_dim=context_dim, temporal_length=temporal_length, use_relative_position=use_relative_position, - **kwargs)) + **kwargs, + ) + ) if level and i == num_res_blocks: out_ch = ch layers.append( @@ -629,13 +651,18 @@ def __init__(self, kernel_size_t=kernel_size_t, padding_t=padding_t, 
nonlinearity_type=nonlinearity_type, - **kwargs) if resblock_updown else Upsample( - ch, - conv_resample, - dims=dims, - out_channels=out_ch, - kernel_size_t=kernel_size_t, - padding_t=padding_t, )) + **kwargs, + ) + if resblock_updown + else Upsample( + ch, + conv_resample, + dims=dims, + out_channels=out_ch, + kernel_size_t=kernel_size_t, + padding_t=padding_t, + ) + ) ds //= 2 self.output_blocks.append(TimestepEmbedSequential(*layers)) self._feature_size += ch @@ -648,7 +675,10 @@ def __init__(self, model_channels, out_channels, (kernel_size_t, 3, 3), - padding=(padding_t, 1, 1), )), ) + padding=(padding_t, 1, 1), + ) + ), + ) def convert_to_fp16(self): """ @@ -666,13 +696,7 @@ def convert_to_fp32(self): self.middle_block.apply(fn=convert_module_to_f32) self.output_blocks.apply(fn=convert_module_to_f32) - def forward(self, - x, - timesteps=None, - time_emb_replace=None, - context=None, - y=None, - **kwargs): + def forward(self, x, timesteps=None, time_emb_replace=None, context=None, y=None, **kwargs): """ Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. @@ -683,13 +707,12 @@ def forward(self, """ hs = [] if time_emb_replace is None: - t_emb = timestep_embedding( - timesteps, self.model_channels, repeat_only=False) + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) emb = self.time_embed(t_emb) else: emb = time_emb_replace if y is not None: - assert y.shape == (x.shape[0], ) + assert y.shape == (x.shape[0],) emb = emb + self.label_emb(y) h = x.astype(self.dtype) for module in self.input_blocks: @@ -711,42 +734,30 @@ class FrameInterpPredUNet(LVDMUNet3DModel): may need to input `mask` to indicate condition, as well as noise level `s` for condition augmentation. """ - def __init__(self, - image_size, - in_channels, - cond_aug_mode=None, - *args, - **kwargs): + def __init__(self, image_size, in_channels, cond_aug_mode=None, *args, **kwargs): super().__init__(image_size, in_channels, *args, **kwargs) if cond_aug_mode == "time_embed": self.time_embed_cond = paddle.nn.Sequential( linear(self.model_channels, self.time_embed_dim), nonlinearity(self.nonlinearity_type), - linear(self.time_embed_dim, self.time_embed_dim), ) + linear(self.time_embed_dim, self.time_embed_dim), + ) elif cond_aug_mode == "learned_embed": pass - def forward(self, - x, - timesteps, - context=None, - y=None, - s=None, - mask=None, - **kwargs): + def forward(self, x, timesteps, context=None, y=None, s=None, mask=None, **kwargs): if s is not None: - s_emb = timestep_embedding( - s, self.model_channels, repeat_only=False) + s_emb = timestep_embedding(s, self.model_channels, repeat_only=False) s_emb = self.time_embed_cond(s_emb) - t_emb = timestep_embedding( - timesteps, self.model_channels, repeat_only=False) + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) emb = self.time_embed(t_emb) assert emb.dim() == 2 mask_ = mask[:, :, :, (0), (0)] t = mask.shape[2] - emb_mix = (emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * - (1 - mask_) + s_emb.unsqueeze(axis=2).tile( - repeat_times=[1, 1, t]) * mask_) + emb_mix = ( + emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * (1 - mask_) + + s_emb.unsqueeze(axis=2).tile(repeat_times=[1, 1, t]) * mask_ + ) assert emb_mix.dim() == 3 emb_mix = rearrange(emb_mix, "b c t -> b t c") time_emb_replace = emb_mix @@ -754,10 +765,4 @@ def forward(self, else: time_emb_replace = None timesteps = timesteps - return super().forward( - x, - timesteps, - time_emb_replace=time_emb_replace, - context=context, - 
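FrameInterpPredUNet.forward blends two embeddings per frame: conditioning frames (mask == 1) receive the noise-augmentation embedding s_emb, frames to be generated (mask == 0) keep the ordinary timestep embedding, and the result is rearranged from "b c t" to "b t c". A small NumPy sketch of that mixing step, with illustrative shapes:

    import numpy as np

    def mix_time_embeddings(emb, s_emb, mask_t):
        # emb, s_emb: (b, c) embeddings; mask_t: (b, t), 1 marks a conditioning frame
        t = mask_t.shape[1]
        emb_t = np.repeat(emb[:, :, None], t, axis=2)        # (b, c, t)
        s_emb_t = np.repeat(s_emb[:, :, None], t, axis=2)    # (b, c, t)
        mixed = emb_t * (1 - mask_t[:, None, :]) + s_emb_t * mask_t[:, None, :]
        return np.transpose(mixed, (0, 2, 1))                # rearrange "b c t -> b t c"

    emb, s_emb = np.ones((2, 4)), np.zeros((2, 4))
    mask_t = np.array([[1, 0, 0, 1], [0, 0, 1, 1]], dtype=np.float64)
    print(mix_time_embeddings(emb, s_emb, mask_t)[0, :, 0])   # frames 0 and 3 use s_emb -> 0.0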
y=y, - **kwargs) + return super().forward(x, timesteps, time_emb_replace=time_emb_replace, context=context, y=y, **kwargs) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_util.py b/ppdiffusers/ppdiffusers/models/lvdm_util.py index 18551f6900d0f..a3c8faa7fb7fe 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_util.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_util.py @@ -27,7 +27,7 @@ def make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2): """ mask = paddle.zeros(shape=[t]) mask[:n_interp1] = 1 - mask[t - n_interp2:] = 1 + mask[t - n_interp2 :] = 1 return mask @@ -42,14 +42,15 @@ def make_interp_mask_with_framestride(t, device, frame_stride): def random_temporal_masking( - input_shape, - p_interp, - p_pred, - device, - n_interp1=1, - n_interp2=1, - n_prevs=[1], - interp_frame_stride=None, ): + input_shape, + p_interp, + p_pred, + device, + n_interp1=1, + n_interp2=1, + n_prevs=[1], + interp_frame_stride=None, +): """return mask for masking input, where 1 indicates given real image as condition, 0 indicates noisy samples. """ @@ -61,11 +62,9 @@ def random_temporal_masking( r = random.random() if r < p_interp: if interp_frame_stride is not None: - mask[i] = make_interp_mask_with_framestride(t, device, - interp_frame_stride) + mask[i] = make_interp_mask_with_framestride(t, device, interp_frame_stride) else: - mask[i] = make_interp_mask_with_bothsidescond( - t, device, n_interp1, n_interp2) + mask[i] = make_interp_mask_with_bothsidescond(t, device, n_interp1, n_interp2) elif p_interp <= r < p_interp + p_pred: n_pred = random.choice(n_prevs) mask[(i), :n_pred] = 1 @@ -76,51 +75,35 @@ def random_temporal_masking( return mask -def make_beta_schedule(schedule, - n_timestep, - linear_start=0.0001, - linear_end=0.02, - cosine_s=0.008): +def make_beta_schedule(schedule, n_timestep, linear_start=0.0001, linear_end=0.02, cosine_s=0.008): if schedule == "linear": - betas = (paddle.linspace( - start=linear_start**0.5, stop=linear_end**0.5, - num=n_timestep).astype("float64")**2) + betas = ( + paddle.linspace(start=linear_start**0.5, stop=linear_end**0.5, num=n_timestep).astype("float64") ** 2 + ) elif schedule == "cosine": - timesteps = (paddle.arange(end=n_timestep + 1).astype("float64") / - n_timestep + cosine_s) + timesteps = paddle.arange(end=n_timestep + 1).astype("float64") / n_timestep + cosine_s alphas = timesteps / (1 + cosine_s) * np.pi / 2 alphas = paddle.cos(x=alphas).pow(y=2) alphas = alphas / alphas[0] betas = 1 - alphas[1:] / alphas[:-1] betas = np.clip(betas, a_min=0, a_max=0.999) elif schedule == "sqrt_linear": - betas = paddle.linspace( - start=linear_start, stop=linear_end, - num=n_timestep).astype("float64") + betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64") elif schedule == "sqrt": - betas = (paddle.linspace( - start=linear_start, stop=linear_end, - num=n_timestep).astype("float64")**0.5) + betas = paddle.linspace(start=linear_start, stop=linear_end, num=n_timestep).astype("float64") ** 0.5 else: raise ValueError(f"schedule '{schedule}' unknown.") return betas.numpy() -def make_ddim_timesteps(ddim_discr_method, - num_ddim_timesteps, - num_ddpm_timesteps, - verbose=True): +def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True): if ddim_discr_method == "uniform": c = num_ddpm_timesteps // num_ddim_timesteps ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) elif ddim_discr_method == "quad": - ddim_timesteps = (np.linspace(0, - np.sqrt(num_ddpm_timesteps * 0.8), - 
num_ddim_timesteps)**2).astype(int) + ddim_timesteps = (np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps) ** 2).astype(int) else: - raise NotImplementedError( - f'There is no ddim discretization method called "{ddim_discr_method}"' - ) + raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"') steps_out = ddim_timesteps + 1 if verbose: print(f"Selected timesteps for ddim sampler: {steps_out}") @@ -129,14 +112,10 @@ def make_ddim_timesteps(ddim_discr_method, def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True): alphas = alphacums[ddim_timesteps] - alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]] - .tolist()) - sigmas = eta * np.sqrt( - (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)) + alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()) + sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)) if verbose: - print( - f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}" - ) + print(f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}") print( f"For the chosen value of eta, which is {eta}, this results in the following sigma_t schedule for ddim sampler {sigmas}" ) @@ -165,7 +144,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): def extract_into_tensor(a, t, x_shape): b, *_ = t.shape out = a.take_along_axis(axis=-1, indices=t) - return out.reshape([b, *((1, ) * (len(x_shape) - 1))]) + return out.reshape([b, *((1,) * (len(x_shape) - 1))]) def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): @@ -179,14 +158,13 @@ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): """ if not repeat_only: half = dim // 2 - freqs = paddle.exp(x=(-math.log(max_period) * paddle.arange( - start=0, end=half).astype("float32") / half).astype("float32")) + freqs = paddle.exp( + x=(-math.log(max_period) * paddle.arange(start=0, end=half).astype("float32") / half).astype("float32") + ) args = timesteps[:, (None)].astype(dtype="float32") * freqs[None] - embedding = paddle.concat( - x=[paddle.cos(x=args), paddle.sin(x=args)], axis=-1) + embedding = paddle.concat(x=[paddle.cos(x=args), paddle.sin(x=args)], axis=-1) if dim % 2: - embedding = paddle.concat( - x=[embedding, paddle.zeros_like(x=embedding[:, :1])], axis=-1) + embedding = paddle.concat(x=[embedding, paddle.zeros_like(x=embedding[:, :1])], axis=-1) else: embedding = repeat(timesteps, "b -> b d", d=dim) return embedding @@ -232,7 +210,8 @@ def Normalize(in_channels): num_channels=in_channels, epsilon=1e-06, weight_attr=None, - bias_attr=None, ) + bias_attr=None, + ) def identity(*args, **kwargs): @@ -249,8 +228,7 @@ def nonlinearity(type="silu"): class GEGLU(paddle.nn.Layer): def __init__(self, dim_in, dim_out): super().__init__() - self.proj = paddle.nn.Linear( - in_features=dim_in, out_features=dim_out * 2) + self.proj = paddle.nn.Linear(in_features=dim_in, out_features=dim_out * 2) def forward(self, x): x, gate = self.proj(x).chunk(chunks=2, axis=-1) diff --git a/ppdiffusers/ppdiffusers/models/lvdm_vae.py b/ppdiffusers/ppdiffusers/models/lvdm_vae.py index 88c1e8a5ac1f0..089afdf908e94 100644 --- a/ppdiffusers/ppdiffusers/models/lvdm_vae.py +++ b/ppdiffusers/ppdiffusers/models/lvdm_vae.py @@ -24,11 +24,7 @@ def conv3d(in_channels, out_channels, kernel_size, conv3d_type="SamePadConv3d"): if conv3d_type == "SamePadConv3d": - return SamePadConv3d( - in_channels, - 
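make_beta_schedule and make_ddim_sampling_parameters above follow the usual DDPM/DDIM recipe: the "linear" schedule is linear in sqrt(beta) between sqrt(linear_start) and sqrt(linear_end), and for the selected subsequence of timesteps the DDIM noise scale is sigma_t = eta * sqrt((1 - a_prev) / (1 - a) * (1 - a / a_prev)), with a the cumulative alpha at those steps. A small NumPy sketch tying the two together:

    import numpy as np

    def linear_beta_schedule(n_timestep, linear_start=1e-4, linear_end=2e-2):
        # "linear" here means linear in sqrt(beta), as in the code above
        return np.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep) ** 2

    def ddim_sigmas(alphacums, ddim_timesteps, eta):
        alphas = alphacums[ddim_timesteps]
        alphas_prev = np.concatenate([[alphacums[0]], alphacums[ddim_timesteps[:-1]]])
        return eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))

    betas = linear_beta_schedule(1000)
    alphacums = np.cumprod(1.0 - betas)                  # cumulative alpha_bar_t
    ddim_steps = np.arange(0, 1000, 1000 // 50)          # "uniform" discretization, 50 steps
    print(ddim_sigmas(alphacums, ddim_steps, eta=0.0))   # eta=0 -> deterministic DDIM, all zeros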
out_channels, - kernel_size=kernel_size, - padding_type="replicate") + return SamePadConv3d(in_channels, out_channels, kernel_size=kernel_size, padding_type="replicate") else: raise NotImplementedError @@ -50,23 +46,24 @@ class AutoencoderKLOutput(BaseOutput): class LVDMAutoencoderKL(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - n_hiddens=32, - downsample=[4, 8, 8], - z_channels=4, - double_z=True, - image_channel=3, - norm_type="group", - padding_type="replicate", - upsample=[4, 8, 8], - embed_dim=4, - # ckpt_path=None, - # ignore_keys=[], - image_key="image", - monitor=None, - std=1.0, - mean=0.0, - prob=0.2, ): + self, + n_hiddens=32, + downsample=[4, 8, 8], + z_channels=4, + double_z=True, + image_channel=3, + norm_type="group", + padding_type="replicate", + upsample=[4, 8, 8], + embed_dim=4, + # ckpt_path=None, + # ignore_keys=[], + image_key="image", + monitor=None, + std=1.0, + mean=0.0, + prob=0.2, + ): super().__init__() self.image_key = image_key # pass init params to Encoder @@ -77,7 +74,8 @@ def __init__( double_z=double_z, image_channel=image_channel, norm_type=norm_type, - padding_type=padding_type, ) + padding_type=padding_type, + ) # pass init params to Decoder self.decoder = Decoder( @@ -85,7 +83,8 @@ def __init__( upsample=upsample, z_channels=z_channels, image_channel=image_channel, - norm_type="group", ) + norm_type="group", + ) self.quant_conv = conv3d(2 * z_channels, 2 * embed_dim, 1) self.post_quant_conv = conv3d(embed_dim, z_channels, 1) diff --git a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py b/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py index bf8d26d5beaf5..213b2efdd2ca9 100644 --- a/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py +++ b/ppdiffusers/ppdiffusers/models/modeling_pytorch_paddle_utils.py @@ -20,9 +20,7 @@ ##################### -def convert_pytorch_state_dict_to_paddle(pt_state_dict, - paddle_model: nn.Layer, - sub_layer=None): +def convert_pytorch_state_dict_to_paddle(pt_state_dict, paddle_model: nn.Layer, sub_layer=None): # Step 1: Find Linear layer which need transpose weight linear_need_transpose = [] for k, v in paddle_model.named_sublayers(include_self=True): @@ -51,7 +49,7 @@ def convert_pytorch_state_dict_to_paddle(pt_state_dict, pt_tensor = pt_tensor.T # (2) 0d tensor -> 1d tensor if pt_tensor.ndim == 0: - pt_tensor = pt_tensor.reshape((1, )) + pt_tensor = pt_tensor.reshape((1,)) # (3) name mapping for old_key, new_key in ptname2pdname.items(): pt_key = pt_key.replace(old_key, new_key) @@ -61,10 +59,7 @@ def convert_pytorch_state_dict_to_paddle(pt_state_dict, @classmethod -def convert_pytorch_state_dict_to_paddle_class_method(cls, - pt_state_dict, - paddle_model: nn.Layer, - sub_layer=None): +def convert_pytorch_state_dict_to_paddle_class_method(cls, pt_state_dict, paddle_model: nn.Layer, sub_layer=None): # Step 1: Find Linear layer which need transpose weight linear_need_transpose = [] for k, v in paddle_model.named_sublayers(include_self=True): @@ -96,7 +91,7 @@ def convert_pytorch_state_dict_to_paddle_class_method(cls, pt_tensor = pt_tensor.T # (2) 0d tensor -> 1d tensor if pt_tensor.ndim == 0: - pt_tensor = pt_tensor.reshape((1, )) + pt_tensor = pt_tensor.reshape((1,)) # (3) name mapping for old_key, new_key in ptname2pdname.items(): pt_key = pt_key.replace(old_key, new_key) @@ -137,9 +132,7 @@ def convert_paddle_state_dict_to_pytorch(pd_state_dict, paddle_model: nn.Layer): pd_key = pd_key.replace(new_key, old_key) if hasattr(paddle_model, 
"paddle_torch_name_mapping"): pd_key = paddle_model.paddle_torch_name_mapping.get(pd_key, pd_key) - pytorch_state_dict[pd_key] = (pd_tensor.contiguous() - if hasattr(pd_tensor, "contiguous") else - pd_tensor) + pytorch_state_dict[pd_key] = pd_tensor.contiguous() if hasattr(pd_tensor, "contiguous") else pd_tensor return pytorch_state_dict diff --git a/ppdiffusers/ppdiffusers/models/modeling_utils.py b/ppdiffusers/ppdiffusers/models/modeling_utils.py index 27514475bc7c2..bf9ed3663d724 100644 --- a/ppdiffusers/ppdiffusers/models/modeling_utils.py +++ b/ppdiffusers/ppdiffusers/models/modeling_utils.py @@ -21,16 +21,33 @@ import paddle import paddle.nn as nn -from ..utils import (CONFIG_NAME, DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB, - HF_HUB_OFFLINE, LOW_CPU_MEM_USAGE_DEFAULT, - PADDLE_WEIGHTS_NAME, PPDIFFUSERS_CACHE, TO_DIFFUSERS, - TORCH_SAFETENSORS_WEIGHTS_NAME, TORCH_WEIGHTS_NAME, - _add_variant, _get_model_file, deprecate, - is_paddlenlp_available, is_safetensors_available, - is_torch_available, is_torch_file, logging, smart_load) +from ..utils import ( + CONFIG_NAME, + DIFFUSERS_CACHE, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + LOW_CPU_MEM_USAGE_DEFAULT, + PADDLE_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + TO_DIFFUSERS, + TORCH_SAFETENSORS_WEIGHTS_NAME, + TORCH_WEIGHTS_NAME, + _add_variant, + _get_model_file, + deprecate, + is_paddlenlp_available, + is_safetensors_available, + is_torch_available, + is_torch_file, + logging, + smart_load, +) from ..version import VERSION as __version__ from .modeling_pytorch_paddle_utils import ( - convert_paddle_state_dict_to_pytorch, convert_pytorch_state_dict_to_paddle) + convert_paddle_state_dict_to_pytorch, + convert_pytorch_state_dict_to_paddle, +) logger = logging.get_logger(__name__) @@ -87,11 +104,7 @@ def convert_state_dict(state_dict, framework="torch"): state_dict = {k: v.cpu().numpy() for k, v in state_dict.items()} return state_dict elif framework in ["paddle", "pd"]: - state_dict = { - k: paddle.to_tensor( - v, place="cpu") - for k, v in state_dict.items() - } + state_dict = {k: paddle.to_tensor(v, place="cpu") for k, v in state_dict.items()} return state_dict else: raise NotImplementedError(f"Not Implemented {framework} framework!") @@ -129,9 +142,7 @@ class ModelMixin(nn.Layer): [`~models.ModelMixin.save_pretrained`]. 
""" config_name = CONFIG_NAME - _automatically_saved_args = [ - "_ppdiffusers_version", "_class_name", "_name_or_path" - ] + _automatically_saved_args = ["_ppdiffusers_version", "_class_name", "_name_or_path"] _supports_gradient_checkpointing = False def __init__(self): @@ -144,8 +155,7 @@ def __getattr__(self, name: str) -> Any: https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module """ - is_in_config = "_internal_dict" in self.__dict__ and hasattr( - self.__dict__["_internal_dict"], name) + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) is_attribute = name in self.__dict__ if is_in_config and not is_attribute: @@ -155,7 +165,8 @@ def __getattr__(self, name: str) -> Any: "1.0.0", deprecation_message, standard_warn=False, - stacklevel=3, ) + stacklevel=3, + ) return self._internal_dict[name] # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module @@ -171,7 +182,8 @@ def is_gradient_checkpointing(self) -> bool: """ return any( hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing - for m in self.sublayers(include_self=True)) + for m in self.sublayers(include_self=True) + ) def enable_gradient_checkpointing(self): """ @@ -181,9 +193,7 @@ def enable_gradient_checkpointing(self): activations". """ if not self._supports_gradient_checkpointing: - raise ValueError( - f"{self.__class__.__name__} does not support gradient checkpointing." - ) + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") self.apply(partial(self._set_gradient_checkpointing, value=True)) def disable_gradient_checkpointing(self): @@ -196,15 +206,13 @@ def disable_gradient_checkpointing(self): if self._supports_gradient_checkpointing: self.apply(partial(self._set_gradient_checkpointing, value=False)) - def set_use_memory_efficient_attention_xformers( - self, valid: bool, attention_op: Optional[str]=None) -> None: + def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None: # Recursively walk through all the children. # Any children which exposes the set_use_memory_efficient_attention_xformers method # gets the message def fn_recursive_set_mem_eff(module: nn.Layer): if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, - attention_op) + module.set_use_memory_efficient_attention_xformers(valid, attention_op) for child in module.children(): fn_recursive_set_mem_eff(child) @@ -213,8 +221,7 @@ def fn_recursive_set_mem_eff(module: nn.Layer): if isinstance(module, nn.Layer): fn_recursive_set_mem_eff(module) - def enable_xformers_memory_efficient_attention( - self, attention_op: Optional[str]=None): + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None): r""" Enable memory efficient attention as implemented in xformers. 
@@ -249,13 +256,14 @@ def disable_xformers_memory_efficient_attention(self): self.set_use_memory_efficient_attention_xformers(False) def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool=True, - save_function: Callable=None, - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: Optional[bool]=None, ): + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Callable = None, + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: Optional[bool] = None, + ): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `[`~models.ModelMixin.from_pretrained`]` class method. @@ -280,16 +288,11 @@ def save_pretrained( """ if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." - ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) @@ -314,14 +317,11 @@ def save_pretrained( if safe_serialization: if is_torch_available(): save_function = safetensors_torch_save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: save_function = safetensors_numpy_save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") - weights_name = _add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, - variant) + state_dict = convert_state_dict(state_dict, framework="numpy") + weights_name = _add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant) else: if not is_torch_available(): raise ImportError( @@ -329,11 +329,9 @@ def save_pretrained( ) save_function = torch.save weights_name = _add_variant(TORCH_WEIGHTS_NAME, variant) - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") - state_dict = convert_paddle_state_dict_to_pytorch(state_dict, - model_to_save) + state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save) else: save_function = paddle.save weights_name = _add_variant(PADDLE_WEIGHTS_NAME, variant) @@ -341,15 +339,10 @@ def save_pretrained( # Save the model save_function(state_dict, os.path.join(save_directory, weights_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weights_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}") @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Instantiate a pretrained pytorch model from a pre-trained model configuration. 
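# A short usage sketch for the save_pretrained / from_pretrained pair touched by the
# hunks above. The checkpoint id and output directory are illustrative assumptions;
# the keyword arguments (subfolder, from_hf_hub, from_diffusers, to_diffusers,
# safe_serialization) are the ones visible in this file.
from ppdiffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", from_hf_hub=True, from_diffusers=True
)
# Re-export the Paddle weights in the diffusers/PyTorch layout; per the checks above,
# safe_serialization=True additionally requires the safetensors package.
unet.save_pretrained("./unet_exported", to_diffusers=True, safe_serialization=True)
reloaded = UNet2DConditionModel.from_pretrained("./unet_exported", from_diffusers=True)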
@@ -425,8 +418,9 @@ def from_pretrained( """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) @@ -439,13 +433,11 @@ def from_pretrained( paddle_dtype = kwargs.pop("paddle_dtype", None) subfolder = kwargs.pop("subfolder", None) ignore_keys = kwargs.pop("ignore_keys", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", - LOW_CPU_MEM_USAGE_DEFAULT) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" ) @@ -476,7 +468,8 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, from_hf_hub=from_hf_hub, # whether or not from_hf_hub - **kwargs, ) + **kwargs, + ) # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # Load model @@ -486,8 +479,7 @@ def from_pretrained( try: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=_add_variant( - TORCH_SAFETENSORS_WEIGHTS_NAME, variant), + weights_name=_add_variant(TORCH_SAFETENSORS_WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -498,7 +490,8 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) except Exception: @@ -518,7 +511,8 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) else: @@ -535,18 +529,19 @@ def from_pretrained( subfolder=subfolder, user_agent=user_agent, commit_hash=commit_hash, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) init_contexts = [] - dtype = set(v.dtype for v in state_dict.values() - if paddle.is_tensor(v) and paddle.is_floating_point(v)) + dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v)) if len(dtype) > 1 and paddle.float32 not in dtype: raise ValueError( f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please" - f" make sure that {model_file} weights have only one dtype.") + f" make sure that {model_file} weights have only one dtype." 
+ ) elif len(dtype) > 1 and paddle.float32 in dtype: dtype = paddle.float32 elif len(dtype) == 0: @@ -580,21 +575,16 @@ def from_pretrained( for k in keys: for ik in ignore_keys: if k.startswith(ik): - logger.warning( - "Deleting key {} from state_dict.".format(k)) + logger.warning("Deleting key {} from state_dict.".format(k)) del state_dict[k] - ( + (model, missing_keys, unexpected_keys, mismatched_keys, error_msgs,) = cls._load_pretrained_model( model, - missing_keys, - unexpected_keys, - mismatched_keys, - error_msgs, ) = cls._load_pretrained_model( - model, - state_dict, - model_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=ignore_mismatched_sizes, ) + state_dict, + model_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + ) loading_info = { "missing_keys": missing_keys, @@ -621,12 +611,13 @@ def from_pretrained( @classmethod def _load_pretrained_model( - cls, - model, - state_dict, - resolved_archive_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=False, ): + cls, + model, + state_dict, + resolved_archive_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=False, + ): # Retrieve missing & unexpected_keys model_state_dict = model.state_dict() loaded_keys = list(state_dict.keys()) @@ -642,21 +633,25 @@ def _load_pretrained_model( model_to_load = model def _find_mismatched_keys( - state_dict, - model_state_dict, - loaded_keys, - ignore_mismatched_sizes, ): + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, + ): mismatched_keys = [] for checkpoint_key in loaded_keys: model_key = checkpoint_key - if model_key in model_state_dict and list(state_dict[ - checkpoint_key].shape) != list(model_state_dict[ - model_key].shape): - mismatched_keys.append(( - checkpoint_key, - state_dict[checkpoint_key].shape, - model_state_dict[model_key].shape, )) + if model_key in model_state_dict and list(state_dict[checkpoint_key].shape) != list( + model_state_dict[model_key].shape + ): + mismatched_keys.append( + ( + checkpoint_key, + state_dict[checkpoint_key].shape, + model_state_dict[model_key].shape, + ) + ) del state_dict[checkpoint_key] if ignore_mismatched_sizes: mismatched_keys = [] @@ -668,7 +663,8 @@ def _find_mismatched_keys( state_dict, model_state_dict, original_loaded_keys, - ignore_mismatched_sizes, ) + ignore_mismatched_sizes, + ) error_msgs = [] for key_name, loaded_shape, model_shape in mismatched_keys: error_msgs.append( @@ -679,10 +675,10 @@ def _find_mismatched_keys( if len(error_msgs) > 0: error_msg = "\n\t".join(error_msgs) if "size mismatch" in error_msg: - error_msg += "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." - raise RuntimeError( - f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}" - ) + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." 
+ ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") if len(unexpected_keys) > 0: logger.warning( @@ -693,11 +689,10 @@ def _find_mismatched_keys( " BertForPreTraining model).\n- This IS NOT expected if you are initializing" f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" " identical (initializing a BertForSequenceClassification model from a" - " BertForSequenceClassification model).") - else: - logger.info( - f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" + " BertForSequenceClassification model)." ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") if len(missing_keys) > 0: logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" @@ -709,17 +704,21 @@ def _find_mismatched_keys( f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" - " without further training.") + " without further training." + ) if len(mismatched_keys) > 0: - mismatched_warning = "\n".join([ - f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" - for key, shape1, shape2 in mismatched_keys - ]) + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" - " able to use it for predictions and inference.") + " able to use it for predictions and inference." + ) return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs @@ -738,9 +737,7 @@ def dtype(self) -> paddle.dtype: """ return get_parameter_dtype(self) - def num_parameters(self, - only_trainable: bool=False, - exclude_embeddings: bool=False) -> int: + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: """ Get number of (optionally, trainable or non-embeddings) parameters in the module. 
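# A small illustration of the counting rule behind num_parameters (its body follows in
# the next hunk): in Paddle a parameter counts as trainable when its stop_gradient flag
# is False. The toy layer below is only an example.
import paddle.nn as nn

layer = nn.Linear(8, 4)
total_params = sum(p.numel() for p in layer.parameters())
trainable_params = sum(p.numel() for p in layer.parameters() if not p.stop_gradient)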
@@ -762,14 +759,11 @@ def num_parameters(self, if isinstance(module_type, nn.Embedding) ] non_embedding_parameters = [ - parameter for name, parameter in self.named_parameters() - if name not in embedding_param_names + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names ] - return sum(p.numel() for p in non_embedding_parameters - if not p.stop_gradient or not only_trainable) + return sum(p.numel() for p in non_embedding_parameters if not p.stop_gradient or not only_trainable) else: - return sum(p.numel() for p in self.parameters() - if not p.stop_gradient or not only_trainable) + return sum(p.numel() for p in self.parameters() if not p.stop_gradient or not only_trainable) def unfreeze_params(params): diff --git a/ppdiffusers/ppdiffusers/models/prior_transformer.py b/ppdiffusers/ppdiffusers/models/prior_transformer.py index 8d1b6af0782a0..90c1da6ee3232 100644 --- a/ppdiffusers/ppdiffusers/models/prior_transformer.py +++ b/ppdiffusers/ppdiffusers/models/prior_transformer.py @@ -65,14 +65,15 @@ class PriorTransformer(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - num_attention_heads: int=32, - attention_head_dim: int=64, - num_layers: int=20, - embedding_dim: int=768, - num_embeddings=77, - additional_embeddings=4, - dropout: float=0.0, ): + self, + num_attention_heads: int = 32, + attention_head_dim: int = 64, + num_layers: int = 20, + embedding_dim: int = 768, + num_embeddings=77, + additional_embeddings=4, + dropout: float = 0.0, + ): super().__init__() self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim @@ -90,20 +91,26 @@ def __init__( self.positional_embedding = self.create_parameter( (1, num_embeddings + additional_embeddings, inner_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) self.prd_embedding = self.create_parameter( (1, 1, inner_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) - self.transformer_blocks = nn.LayerList([ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - activation_fn="gelu", - attention_bias=True, ) for d in range(num_layers) - ]) + default_initializer=nn.initializer.Constant(0.0), + ) + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + activation_fn="gelu", + attention_bias=True, + ) + for d in range(num_layers) + ] + ) self.norm_out = nn.LayerNorm(inner_dim) self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim) @@ -114,29 +121,33 @@ def __init__( num_embeddings + additional_embeddings, num_embeddings + additional_embeddings, ], - NEG_INF, ), - 1, ) + NEG_INF, + ), + 1, + ) causal_attention_mask = causal_attention_mask.unsqueeze(0) - self.register_buffer( - "causal_attention_mask", causal_attention_mask, persistable=False) + self.register_buffer("causal_attention_mask", causal_attention_mask, persistable=False) self.clip_mean = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) self.clip_std = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) def forward( - self, - hidden_states, - 
timestep: Union[paddle.Tensor, float, int], - proj_embedding: paddle.Tensor, - encoder_hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - return_dict: bool=True, ): + self, + hidden_states, + timestep: Union[paddle.Tensor, float, int], + proj_embedding: paddle.Tensor, + encoder_hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ): """ Args: hidden_states (`paddle.Tensor` of shape `(batch_size, embedding_dim)`): @@ -168,8 +179,7 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps * paddle.ones( - (batch_size, ), dtype=timesteps.dtype) + timesteps = timesteps * paddle.ones((batch_size,), dtype=timesteps.dtype) timesteps_projected = self.time_proj(timesteps) @@ -179,13 +189,10 @@ def forward( time_embeddings = self.time_embedding(timesteps_projected) proj_embeddings = self.embedding_proj(proj_embedding) - encoder_hidden_states = self.encoder_hidden_states_proj( - encoder_hidden_states) + encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) hidden_states = self.proj_in(hidden_states) - prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand( - [batch_size, -1, -1]) - positional_embeddings = self.positional_embedding.cast( - hidden_states.dtype) + prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand([batch_size, -1, -1]) + positional_embeddings = self.positional_embedding.cast(hidden_states.dtype) hidden_states = paddle.concat( [ @@ -195,23 +202,21 @@ def forward( hidden_states[:, None, :], prd_embedding, ], - axis=1, ) + axis=1, + ) hidden_states = hidden_states + positional_embeddings if attention_mask is not None: - attention_mask = ( - 1 - attention_mask.cast(hidden_states.dtype)) * NEG_INF + attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * NEG_INF attention_mask = F.pad( attention_mask.unsqueeze(0), (0, self.additional_embeddings), value=0.0, - data_format="NCL", ).squeeze(0) - attention_mask = ( - attention_mask[:, None, :] + self.causal_attention_mask - ).cast(hidden_states.dtype) - attention_mask = attention_mask.repeat_interleave( - self.config.num_attention_heads, axis=0) + data_format="NCL", + ).squeeze(0) + attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).cast(hidden_states.dtype) + attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, axis=0) for block in self.transformer_blocks: hidden_states = block(hidden_states, attention_mask=attention_mask) @@ -221,10 +226,9 @@ def forward( predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states) if not return_dict: - return (predicted_image_embedding, ) + return (predicted_image_embedding,) - return PriorTransformerOutput( - predicted_image_embedding=predicted_image_embedding) + return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding) def post_process_latents(self, prior_latents): prior_latents = (prior_latents * self.clip_std) + self.clip_mean diff --git a/ppdiffusers/ppdiffusers/models/resnet.py b/ppdiffusers/ppdiffusers/models/resnet.py index 60998dc3fc1b7..39bf23c59264d 100644 --- a/ppdiffusers/ppdiffusers/models/resnet.py +++ b/ppdiffusers/ppdiffusers/models/resnet.py @@ -37,12 +37,13 @@ class Upsample1D(nn.Layer): """ def __init__( - self, - channels, - use_conv=False, - use_conv_transpose=False, - out_channels=None, - name="conv", ): + self, + channels, + use_conv=False, + 
use_conv_transpose=False, + out_channels=None, + name="conv", + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -54,8 +55,7 @@ def __init__( if use_conv_transpose: self.conv = nn.Conv1DTranspose(channels, self.out_channels, 4, 2, 1) elif use_conv: - self.conv = nn.Conv1D( - self.channels, self.out_channels, 3, padding=1) + self.conv = nn.Conv1D(self.channels, self.out_channels, 3, padding=1) def forward(self, x): assert x.shape[1] == self.channels @@ -81,12 +81,7 @@ class Downsample1D(nn.Layer): padding: """ - def __init__(self, - channels, - use_conv=False, - out_channels=None, - padding=1, - name="conv"): + def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -96,12 +91,7 @@ def __init__(self, self.name = name if use_conv: - self.conv = nn.Conv1D( - self.channels, - self.out_channels, - 3, - stride=stride, - padding=padding) + self.conv = nn.Conv1D(self.channels, self.out_channels, 3, stride=stride, padding=padding) else: assert self.channels == self.out_channels self.conv = nn.AvgPool1D(kernel_size=stride, stride=stride) @@ -123,12 +113,13 @@ class Upsample2D(nn.Layer): """ def __init__( - self, - channels, - use_conv=False, - use_conv_transpose=False, - out_channels=None, - name="conv", ): + self, + channels, + use_conv=False, + use_conv_transpose=False, + out_channels=None, + name="conv", + ): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -164,11 +155,9 @@ def forward(self, hidden_states, output_size=None): # if `output_size` is passed we force the interpolation output # size and do not make use of `scale_factor=2` if output_size is None: - hidden_states = F.interpolate( - hidden_states, scale_factor=2.0, mode="nearest") + hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest") else: - hidden_states = F.interpolate( - hidden_states, size=output_size, mode="nearest") + hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") # If the input is bfloat16, we cast back to bfloat16 if dtype == paddle.bfloat16: @@ -195,12 +184,7 @@ class Downsample2D(nn.Layer): padding: """ - def __init__(self, - channels, - use_conv=False, - out_channels=None, - padding=1, - name="conv"): + def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"): super().__init__() self.channels = channels self.out_channels = out_channels or channels @@ -210,12 +194,7 @@ def __init__(self, self.name = name if use_conv: - conv = nn.Conv2D( - self.channels, - self.out_channels, - 3, - stride=stride, - padding=padding) + conv = nn.Conv2D(self.channels, self.out_channels, 3, stride=stride, padding=padding) else: assert self.channels == self.out_channels conv = nn.AvgPool2D(kernel_size=stride, stride=stride) @@ -242,26 +221,16 @@ def forward(self, hidden_states): class FirUpsample2D(nn.Layer): - def __init__(self, - channels=None, - out_channels=None, - use_conv=False, - fir_kernel=(1, 3, 3, 1)): + def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)): super().__init__() out_channels = out_channels if out_channels else channels if use_conv: - self.Conv2d_0 = nn.Conv2D( - channels, out_channels, kernel_size=3, stride=1, padding=1) + self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1) self.use_conv = use_conv self.fir_kernel = fir_kernel self.out_channels = 
out_channels - def _upsample_2d(self, - hidden_states, - weight=None, - kernel=None, - factor=2, - gain=1): + def _upsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1): """Fused `upsample_2d()` followed by `Conv2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more @@ -307,12 +276,12 @@ def _upsample_2d(self, # Determine data dimensions. output_shape = ( (hidden_states.shape[2] - 1) * factor + convH, - (hidden_states.shape[3] - 1) * factor + convW, ) + (hidden_states.shape[3] - 1) * factor + convW, + ) output_padding = ( - output_shape[0] - - (hidden_states.shape[2] - 1) * stride[0] - convH, - output_shape[1] - - (hidden_states.shape[3] - 1) * stride[1] - convW, ) + output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH, + output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW, + ) assert output_padding[0] >= 0 and output_padding[1] >= 0 num_groups = hidden_states.shape[1] // inC @@ -326,55 +295,46 @@ def _upsample_2d(self, weight, stride=stride, output_padding=output_padding, - padding=0, ) + padding=0, + ) output = upfirdn2d_native( inverse_conv, paddle.to_tensor(kernel), - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1), ) + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1), + ) else: pad_value = kernel.shape[0] - factor output = upfirdn2d_native( hidden_states, paddle.to_tensor(kernel), up=factor, - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), ) + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), + ) return output def forward(self, hidden_states): if self.use_conv: - height = self._upsample_2d( - hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) + height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) height = height + self.Conv2d_0.bias.reshape([1, -1, 1, 1]) else: - height = self._upsample_2d( - hidden_states, kernel=self.fir_kernel, factor=2) + height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) return height class FirDownsample2D(nn.Layer): - def __init__(self, - channels=None, - out_channels=None, - use_conv=False, - fir_kernel=(1, 3, 3, 1)): + def __init__(self, channels=None, out_channels=None, use_conv=False, fir_kernel=(1, 3, 3, 1)): super().__init__() out_channels = out_channels if out_channels else channels if use_conv: - self.Conv2d_0 = nn.Conv2D( - channels, out_channels, kernel_size=3, stride=1, padding=1) + self.Conv2d_0 = nn.Conv2D(channels, out_channels, kernel_size=3, stride=1, padding=1) self.fir_kernel = fir_kernel self.use_conv = use_conv self.out_channels = out_channels - def _downsample_2d(self, - hidden_states, - weight=None, - kernel=None, - factor=2, - gain=1): + def _downsample_2d(self, hidden_states, weight=None, kernel=None, factor=2, gain=1): """Fused `Conv2d()` followed by `downsample_2d()`. Padding is performed only once at the beginning, not between the operations. The fused op is considerably more efficient than performing the same calculation using standard TensorFlow ops. 
It supports gradients of @@ -414,30 +374,26 @@ def _downsample_2d(self, upfirdn_input = upfirdn2d_native( hidden_states, paddle.to_tensor(kernel), - pad=((pad_value + 1) // 2, pad_value // 2), ) - output = F.conv2d( - upfirdn_input, weight, stride=stride_value, padding=0) + pad=((pad_value + 1) // 2, pad_value // 2), + ) + output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0) else: pad_value = kernel.shape[0] - factor output = upfirdn2d_native( hidden_states, paddle.to_tensor(kernel), down=factor, - pad=((pad_value + 1) // 2, pad_value // 2), ) + pad=((pad_value + 1) // 2, pad_value // 2), + ) return output def forward(self, hidden_states): if self.use_conv: - downsample_input = self._downsample_2d( - hidden_states, - weight=self.Conv2d_0.weight, - kernel=self.fir_kernel) - hidden_states = downsample_input + self.Conv2d_0.bias.reshape( - [1, -1, 1, 1]) + downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel) + hidden_states = downsample_input + self.Conv2d_0.bias.reshape([1, -1, 1, 1]) else: - hidden_states = self._downsample_2d( - hidden_states, kernel=self.fir_kernel, factor=2) + hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) return hidden_states @@ -451,18 +407,16 @@ def __init__(self, pad_mode="reflect"): self.pad = kernel_1d.shape[1] // 2 - 1 self.register_buffer( "kernel", - paddle.matmul( - kernel_1d, kernel_1d, transpose_x=True), - persistable=False, ) + paddle.matmul(kernel_1d, kernel_1d, transpose_x=True), + persistable=False, + ) def forward(self, x): - x = F.pad(x, (self.pad, ) * 4, self.pad_mode) + x = F.pad(x, (self.pad,) * 4, self.pad_mode) weight = paddle.zeros( - [ - x.shape[1], x.shape[1], self.kernel.shape[0], - self.kernel.shape[1] - ], - dtype=x.dtype, ) + [x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]], + dtype=x.dtype, + ) indices = paddle.arange(x.shape[1]) # TODO verify this method weight[indices, indices] = self.kernel.cast(weight.dtype) @@ -477,18 +431,16 @@ def __init__(self, pad_mode="reflect"): self.pad = kernel_1d.shape[1] // 2 - 1 self.register_buffer( "kernel", - paddle.matmul( - kernel_1d, kernel_1d, transpose_x=True), - persistable=False, ) + paddle.matmul(kernel_1d, kernel_1d, transpose_x=True), + persistable=False, + ) def forward(self, x): - x = F.pad(x, ((self.pad + 1) // 2, ) * 4, self.pad_mode) + x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode) weight = paddle.zeros( - [ - x.shape[1], x.shape[1], self.kernel.shape[0], - self.kernel.shape[1] - ], - dtype=x.dtype, ) + [x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]], + dtype=x.dtype, + ) indices = paddle.arange(x.shape[1]) # TODO verify this method weight[indices, indices] = self.kernel.cast(weight.dtype) @@ -527,28 +479,28 @@ class ResnetBlock2D(nn.Layer): """ def __init__( - self, - *, - in_channels, - out_channels=None, - conv_shortcut=False, - dropout=0.0, - temb_channels=512, - groups=32, - groups_out=None, - pre_norm=True, - eps=1e-6, - non_linearity="swish", - skip_time_act: bool=False, # skip_time_act is the same as pre_temb_non_linearity - time_embedding_norm="default", # default, scale_shift, ada_group - kernel=None, - output_scale_factor=1.0, - use_in_shortcut=None, - up=False, - down=False, - conv_shortcut_bias: bool=True, - conv_2d_out_channels: Optional[int]=None, - pre_temb_non_linearity: bool=False, # skip_time_act is the same as pre_temb_non_linearity + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + 
dropout=0.0, + temb_channels=512, + groups=32, + groups_out=None, + pre_norm=True, + eps=1e-6, + non_linearity="swish", + skip_time_act: bool = False, # skip_time_act is the same as pre_temb_non_linearity + time_embedding_norm="default", # default, scale_shift, ada_group + kernel=None, + output_scale_factor=1.0, + use_in_shortcut=None, + up=False, + down=False, + conv_shortcut_bias: bool = True, + conv_2d_out_channels: Optional[int] = None, + pre_temb_non_linearity: bool = False, # skip_time_act is the same as pre_temb_non_linearity ): super().__init__() self.pre_temb_non_linearity = pre_temb_non_linearity @@ -568,14 +520,11 @@ def __init__( groups_out = groups if self.time_embedding_norm == "ada_group": - self.norm1 = AdaGroupNorm( - temb_channels, in_channels, groups, eps=eps) + self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) else: - self.norm1 = nn.GroupNorm( - num_groups=groups, num_channels=in_channels, epsilon=eps) + self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=in_channels, epsilon=eps) - self.conv1 = nn.Conv2D( - in_channels, out_channels, kernel_size=3, stride=1, padding=1) + self.conv1 = nn.Conv2D(in_channels, out_channels, kernel_size=3, stride=1, padding=1) if temb_channels is not None: if self.time_embedding_norm == "default": @@ -585,26 +534,18 @@ def __init__( elif self.time_embedding_norm == "ada_group": self.time_emb_proj = None else: - raise ValueError( - f"unknown time_embedding_norm : {self.time_embedding_norm} ") + raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") else: self.time_emb_proj = None if self.time_embedding_norm == "ada_group": - self.norm2 = AdaGroupNorm( - temb_channels, out_channels, groups_out, eps=eps) + self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) else: - self.norm2 = nn.GroupNorm( - num_groups=groups_out, num_channels=out_channels, epsilon=eps) + self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, epsilon=eps) self.dropout = nn.Dropout(dropout) conv_2d_out_channels = conv_2d_out_channels or out_channels - self.conv2 = nn.Conv2D( - out_channels, - conv_2d_out_channels, - kernel_size=3, - stride=1, - padding=1) + self.conv2 = nn.Conv2D(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1) if non_linearity == "swish": self.nonlinearity = lambda x: F.silu(x) @@ -621,8 +562,7 @@ def __init__( fir_kernel = (1, 3, 3, 1) self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel) elif kernel == "sde_vp": - self.upsample = partial( - F.interpolate, scale_factor=2.0, mode="nearest") + self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest") else: self.upsample = Upsample2D(in_channels, use_conv=False) elif self.down: @@ -632,11 +572,9 @@ def __init__( elif kernel == "sde_vp": self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2) else: - self.downsample = Downsample2D( - in_channels, use_conv=False, padding=1, name="op") + self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op") - self.use_in_shortcut = (self.in_channels != conv_2d_out_channels - if use_in_shortcut is None else use_in_shortcut) + self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut self.conv_shortcut = None if self.use_in_shortcut: @@ -646,7 +584,8 @@ def __init__( kernel_size=1, stride=1, padding=0, - bias_attr=conv_shortcut_bias, ) + bias_attr=conv_shortcut_bias, + ) def forward(self, input_tensor, temb): hidden_states = input_tensor @@ -693,8 +632,7 
@@ def forward(self, input_tensor, temb): input_tensor = self.conv_shortcut(input_tensor) # TODO this maybe result -inf, input_tensor's min value -57644 hidden_states's min value -10000 - output_tensor = ( - input_tensor + hidden_states) / self.output_scale_factor + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor return output_tensor @@ -724,8 +662,7 @@ class Conv1dBlock(nn.Layer): def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8): super().__init__() - self.conv1d = nn.Conv1D( - inp_channels, out_channels, kernel_size, padding=kernel_size // 2) + self.conv1d = nn.Conv1D(inp_channels, out_channels, kernel_size, padding=kernel_size // 2) self.group_norm = nn.GroupNorm(n_groups, out_channels) self.mish = nn.Mish() @@ -748,8 +685,9 @@ def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5): self.time_emb_act = nn.Mish() self.time_emb = nn.Linear(embed_dim, out_channels) - self.residual_conv = (nn.Conv1D(inp_channels, out_channels, 1) if - inp_channels != out_channels else nn.Identity()) + self.residual_conv = ( + nn.Conv1D(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity() + ) def forward(self, x, t): """ @@ -799,7 +737,8 @@ def upsample_2d(hidden_states, kernel=None, factor=2, gain=1): hidden_states, kernel, up=factor, - pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), ) + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), + ) return output @@ -832,11 +771,7 @@ def downsample_2d(hidden_states, kernel=None, factor=2, gain=1): kernel = kernel * gain pad_value = kernel.shape[0] - factor - output = upfirdn2d_native( - hidden_states, - kernel, - down=factor, - pad=((pad_value + 1) // 2, pad_value // 2)) + output = upfirdn2d_native(hidden_states, kernel, down=factor, pad=((pad_value + 1) // 2, pad_value // 2)) return output @@ -854,9 +789,11 @@ def dummy_pad(tensor, up_x=0, up_y=0): up_x, tensor.shape[5], ], - dtype=tensor.dtype, ), + dtype=tensor.dtype, + ), ], - axis=4, ) + axis=4, + ) if up_y > 0: tensor = paddle.concat( [ @@ -870,9 +807,11 @@ def dummy_pad(tensor, up_x=0, up_y=0): tensor.shape[4], tensor.shape[5], ], - dtype=tensor.dtype, ), + dtype=tensor.dtype, + ), ], - axis=2, ) + axis=2, + ) return tensor @@ -900,23 +839,29 @@ def upfirdn2d_native(tensor, kernel, up=1, down=1, pad=(0, 0)): out = F.pad( out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0), 0, 0], - data_format="NDHWC", ) + data_format="NDHWC", + ) out = out.squeeze(0) - out = out[:, max(-pad_y0, 0):out.shape[1] - max(-pad_y1, 0), max( - -pad_x0, 0):out.shape[2] - max(-pad_x1, 0), :, ] + out = out[ + :, + max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0), + max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0), + :, + ] out = out.transpose([0, 3, 1, 2]) - out = out.reshape( - [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) w = paddle.flip(kernel, [0, 1]).reshape([1, 1, kernel_h, kernel_w]) out = F.conv2d(out, w) - out = out.reshape([ - -1, - minor, - in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, - in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, - ]) + out = out.reshape( + [ + -1, + minor, + in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, + ] + ) out = out.transpose([0, 2, 3, 1]) out = out[:, ::down_y, ::down_x, :] @@ -938,44 +883,48 @@ def __init__(self, in_dim, out_dim=None, dropout=0.0): self.in_dim = in_dim self.out_dim = out_dim self.conv1 = 
nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=in_dim), + nn.GroupNorm(num_groups=32, num_channels=in_dim), nn.Silu(), nn.Conv3D( in_channels=in_dim, out_channels=out_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) self.conv2 = nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=out_dim), + nn.GroupNorm(num_groups=32, num_channels=out_dim), nn.Silu(), nn.Dropout(p=dropout), nn.Conv3D( in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) self.conv3 = nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=out_dim), + nn.GroupNorm(num_groups=32, num_channels=out_dim), nn.Silu(), nn.Dropout(p=dropout), nn.Conv3D( in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) self.conv4 = nn.Sequential( - nn.GroupNorm( - num_groups=32, num_channels=out_dim), + nn.GroupNorm(num_groups=32, num_channels=out_dim), nn.Silu(), nn.Dropout(p=dropout), nn.Conv3D( in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), - padding=(1, 0, 0), ), ) + padding=(1, 0, 0), + ), + ) zeros_(self.conv4[-1].weight) zeros_(self.conv4[-1].bias) @@ -983,14 +932,15 @@ def forward(self, hidden_states, num_frames=1): hidden_states = ( hidden_states[None, :] .reshape((-1, num_frames) + tuple(hidden_states.shape[1:])) - .transpose(perm=[0, 2, 1, 3, 4])) + .transpose(perm=[0, 2, 1, 3, 4]) + ) identity = hidden_states hidden_states = self.conv1(hidden_states) hidden_states = self.conv2(hidden_states) hidden_states = self.conv3(hidden_states) hidden_states = self.conv4(hidden_states) hidden_states = identity + hidden_states - hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape(( - hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple( - hidden_states.shape[3:])) + hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape( + (hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple(hidden_states.shape[3:]) + ) return hidden_states diff --git a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py index fabe9f4eaec86..2d0a45bcc46c9 100644 --- a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py +++ b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py @@ -26,31 +26,30 @@ class T5FilmDecoder(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - input_dims: int=128, - targets_length: int=256, - max_decoder_noise_time: float=2000.0, - d_model: int=768, - num_layers: int=12, - num_heads: int=12, - d_kv: int=64, - d_ff: int=2048, - dropout_rate: float=0.1, ): + self, + input_dims: int = 128, + targets_length: int = 256, + max_decoder_noise_time: float = 2000.0, + d_model: int = 768, + num_layers: int = 12, + num_heads: int = 12, + d_kv: int = 64, + d_ff: int = 2048, + dropout_rate: float = 0.1, + ): super().__init__() self.conditioning_emb = nn.Sequential( - nn.Linear( - d_model, d_model * 4, bias_attr=False), + nn.Linear(d_model, d_model * 4, bias_attr=False), nn.Silu(), - nn.Linear( - d_model * 4, d_model * 4, bias_attr=False), - nn.Silu(), ) + nn.Linear(d_model * 4, d_model * 4, bias_attr=False), + nn.Silu(), + ) self.position_encoding = nn.Embedding(targets_length, d_model) self.position_encoding.weight.stop_gradient = True - self.continuous_inputs_projection = nn.Linear( - input_dims, d_model, bias_attr=False) + self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias_attr=False) self.dropout = 
nn.Dropout(p=dropout_rate) @@ -62,7 +61,8 @@ def __init__( d_kv=d_kv, num_heads=num_heads, d_ff=d_ff, - dropout_rate=dropout_rate, ) + dropout_rate=dropout_rate, + ) self.decoders.append(lyr) self.decoder_norm = T5LayerNorm(d_model) @@ -71,13 +71,10 @@ def __init__( self.spec_out = nn.Linear(d_model, input_dims, bias_attr=False) def encoder_decoder_mask(self, query_input, key_input): - mask = paddle.multiply( - query_input.unsqueeze(-1), - key_input.unsqueeze(-2).cast(query_input.dtype)) + mask = paddle.multiply(query_input.unsqueeze(-1), key_input.unsqueeze(-2).cast(query_input.dtype)) return mask.unsqueeze(-3) - def forward(self, encodings_and_masks, decoder_input_tokens, - decoder_noise_time): + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): batch, _, _ = decoder_input_tokens.shape assert decoder_noise_time.shape[0] == batch @@ -85,7 +82,8 @@ def forward(self, encodings_and_masks, decoder_input_tokens, time_steps = get_timestep_embedding( decoder_noise_time * self.config.max_decoder_noise_time, embedding_dim=self.config.d_model, - max_period=self.config.max_decoder_noise_time, ).cast(self.dtype) + max_period=self.config.max_decoder_noise_time, + ).cast(self.dtype) conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1) @@ -96,37 +94,34 @@ def forward(self, encodings_and_masks, decoder_input_tokens, # If we want to use relative positions for audio context, we can just offset # this sequence by the length of encodings_and_masks. decoder_positions = paddle.broadcast_to( - paddle.arange(seq_length, ), - shape=(batch, seq_length), ) + paddle.arange( + seq_length, + ), + shape=(batch, seq_length), + ) position_encodings = self.position_encoding(decoder_positions) - inputs = self.continuous_inputs_projection( - decoder_input_tokens.cast(position_encodings.dtype)) + inputs = self.continuous_inputs_projection(decoder_input_tokens.cast(position_encodings.dtype)) inputs += position_encodings y = self.dropout(inputs) # decoder: No padding present. - decoder_mask = paddle.ones( - decoder_input_tokens.shape[:2], dtype=inputs.dtype) + decoder_mask = paddle.ones(decoder_input_tokens.shape[:2], dtype=inputs.dtype) # Translate encoding masks to encoder-decoder masks. 
- encodings_and_encdec_masks = [ - (x, self.encoder_decoder_mask(decoder_mask, y)) - for x, y in encodings_and_masks - ] + encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] # cross attend style: concat encodings - encoded = paddle.concat( - [x[0] for x in encodings_and_encdec_masks], axis=1) - encoder_decoder_mask = paddle.concat( - [x[1] for x in encodings_and_encdec_masks], axis=-1) + encoded = paddle.concat([x[0] for x in encodings_and_encdec_masks], axis=1) + encoder_decoder_mask = paddle.concat([x[1] for x in encodings_and_encdec_masks], axis=-1) for lyr in self.decoders: y = lyr( y, conditioning_emb=conditioning_emb, encoder_hidden_states=encoded, - encoder_attention_mask=encoder_decoder_mask, )[0] + encoder_attention_mask=encoder_decoder_mask, + )[0] y = self.decoder_norm(y) y = self.post_dropout(y) @@ -136,13 +131,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, class DecoderLayer(nn.Layer): - def __init__(self, - d_model, - d_kv, - num_heads, - d_ff, - dropout_rate, - layer_norm_epsilon=1e-6): + def __init__(self, d_model, d_kv, num_heads, d_ff, dropout_rate, layer_norm_epsilon=1e-6): super().__init__() self.layer = nn.LayerList() @@ -152,7 +141,9 @@ def __init__(self, d_model=d_model, d_kv=d_kv, num_heads=num_heads, - dropout_rate=dropout_rate, )) + dropout_rate=dropout_rate, + ) + ) # cross attention: layer 1 self.layer.append( @@ -161,7 +152,9 @@ def __init__(self, d_kv=d_kv, num_heads=num_heads, dropout_rate=dropout_rate, - layer_norm_epsilon=layer_norm_epsilon, )) + layer_norm_epsilon=layer_norm_epsilon, + ) + ) # Film Cond MLP + dropout: last layer self.layer.append( @@ -169,62 +162,67 @@ def __init__(self, d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate, - layer_norm_epsilon=layer_norm_epsilon, )) + layer_norm_epsilon=layer_norm_epsilon, + ) + ) def forward( - self, - hidden_states, - conditioning_emb=None, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - encoder_decoder_position_bias=None, ): + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + ): hidden_states = self.layer[0]( hidden_states, conditioning_emb=conditioning_emb, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) if encoder_hidden_states is not None: - encoder_extended_attention_mask = paddle.where( - encoder_attention_mask > 0, 0.0, - -1e10).cast(encoder_hidden_states.dtype) + encoder_extended_attention_mask = paddle.where(encoder_attention_mask > 0, 0.0, -1e10).cast( + encoder_hidden_states.dtype + ) hidden_states = self.layer[1]( hidden_states, key_value_states=encoder_hidden_states, - attention_mask=encoder_extended_attention_mask, ) + attention_mask=encoder_extended_attention_mask, + ) # Apply Film Conditional Feed Forward layer hidden_states = self.layer[-1](hidden_states, conditioning_emb) - return (hidden_states, ) + return (hidden_states,) class T5LayerSelfAttentionCond(nn.Layer): def __init__(self, d_model, d_kv, num_heads, dropout_rate): super().__init__() self.layer_norm = T5LayerNorm(d_model) - self.FiLMLayer = T5FiLMLayer( - in_features=d_model * 4, out_features=d_model) + self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) self.attention = Attention( query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, - scale_qk=False, ) + scale_qk=False, + ) self.dropout = nn.Dropout(dropout_rate) def forward( - 
self, - hidden_states, - conditioning_emb=None, - attention_mask=None, ): + self, + hidden_states, + conditioning_emb=None, + attention_mask=None, + ): # pre_self_attention_layer_norm normed_hidden_states = self.layer_norm(hidden_states) if conditioning_emb is not None: - normed_hidden_states = self.FiLMLayer(normed_hidden_states, - conditioning_emb) + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) # Self-attention block attention_output = self.attention(normed_hidden_states) @@ -235,28 +233,30 @@ def forward( class T5LayerCrossAttention(nn.Layer): - def __init__(self, d_model, d_kv, num_heads, dropout_rate, - layer_norm_epsilon): + def __init__(self, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon): super().__init__() self.attention = Attention( query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, - scale_qk=False, ) + scale_qk=False, + ) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) def forward( - self, - hidden_states, - key_value_states=None, - attention_mask=None, ): + self, + hidden_states, + key_value_states=None, + attention_mask=None, + ): normed_hidden_states = self.layer_norm(hidden_states) attention_output = self.attention( normed_hidden_states, encoder_hidden_states=key_value_states, - attention_mask=attention_mask.squeeze(1), ) + attention_mask=attention_mask.squeeze(1), + ) layer_output = hidden_states + self.dropout(attention_output) return layer_output @@ -264,8 +264,7 @@ def forward( class T5LayerFFCond(nn.Layer): def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon): super().__init__() - self.DenseReluDense = T5DenseGatedActDense( - d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate) + self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate) self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) self.dropout = nn.Dropout(dropout_rate) @@ -306,9 +305,7 @@ class T5LayerNorm(nn.Layer): def __init__(self, hidden_size, eps=1e-6): super().__init__() - self.weight = self.create_parameter( - shape=[hidden_size], - default_initializer=nn.initializer.Constant(1.0)) + self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0)) self.variance_epsilon = eps def forward(self, hidden_states): @@ -317,10 +314,8 @@ def forward(self, hidden_states): # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for # half-precision inputs is done in fp32 - variance = paddle.pow(hidden_states.cast(paddle.float32), 2).mean( - axis=-1, keepdim=True) - hidden_states = hidden_states * paddle.rsqrt(variance + - self.variance_epsilon) + variance = paddle.pow(hidden_states.cast(paddle.float32), 2).mean(axis=-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) # convert into half-precision if necessary if self.weight.dtype == paddle.float16: @@ -335,9 +330,9 @@ class NewGELUActivation(nn.Layer): """ def forward(self, input: paddle.Tensor) -> paddle.Tensor: - return (0.5 * input * (1.0 + paddle.tanh( - math.sqrt(2.0 / math.pi) * - (input + 0.044715 * paddle.pow(input, 3.0))))) + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) class T5FiLMLayer(nn.Layer): @@ -347,8 +342,7 @@ class T5FiLMLayer(nn.Layer): def __init__(self, in_features, out_features): super().__init__() - self.scale_bias = nn.Linear( - in_features, out_features * 2, bias_attr=False) + self.scale_bias = nn.Linear(in_features, out_features * 2, bias_attr=False) def forward(self, x, conditioning_emb): emb = self.scale_bias(conditioning_emb) diff --git a/ppdiffusers/ppdiffusers/models/transformer_2d.py b/ppdiffusers/ppdiffusers/models/transformer_2d.py index e9f47cbee3f7b..2207b8b46974e 100644 --- a/ppdiffusers/ppdiffusers/models/transformer_2d.py +++ b/ppdiffusers/ppdiffusers/models/transformer_2d.py @@ -79,26 +79,27 @@ class Transformer2DModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - num_attention_heads: int=16, - attention_head_dim: int=88, - in_channels: Optional[int]=None, - out_channels: Optional[int]=None, - num_layers: int=1, - dropout: float=0.0, - norm_num_groups: int=32, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - sample_size: Optional[int]=None, - num_vector_embeds: Optional[int]=None, - patch_size: Optional[int]=None, - activation_fn: str="geglu", - num_embeds_ada_norm: Optional[int]=None, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - norm_type: str="layer_norm", - norm_elementwise_affine: bool=True, ): + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + norm_elementwise_affine: bool = True, + ): super().__init__() self.use_linear_projection = use_linear_projection self.num_attention_heads = num_attention_heads @@ -107,8 +108,7 @@ def __init__( # 1. 
Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` # Define whether input is continuous or discrete depending on configuration - self.is_input_continuous = (in_channels is not None) and ( - patch_size is None) + self.is_input_continuous = (in_channels is not None) and (patch_size is None) self.is_input_vectorized = num_vector_embeds is not None self.is_input_patches = in_channels is not None and patch_size is not None @@ -124,7 +124,8 @@ def __init__( "norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) norm_type = "ada_norm" if self.is_input_continuous and self.is_input_vectorized: @@ -137,8 +138,7 @@ def __init__( f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make" " sure that either `num_vector_embeds` or `num_patches` is None." ) - elif (not self.is_input_continuous and not self.is_input_vectorized and - not self.is_input_patches): + elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches: raise ValueError( f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:" f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None." @@ -148,22 +148,14 @@ def __init__( if self.is_input_continuous: self.in_channels = in_channels - self.norm = nn.GroupNorm( - num_groups=norm_num_groups, - num_channels=in_channels, - epsilon=1e-6) + self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-6) if use_linear_projection: self.proj_in = nn.Linear(in_channels, inner_dim) else: - self.proj_in = nn.Conv2D( - in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + self.proj_in = nn.Conv2D(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) elif self.is_input_vectorized: - assert ( - sample_size is not None - ), "Transformer2DModel over discrete input must provide sample_size" - assert ( - num_vector_embeds is not None - ), "Transformer2DModel over discrete input must provide num_embed" + assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size" + assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed" self.height = sample_size self.width = sample_size @@ -174,11 +166,10 @@ def __init__( num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, - width=self.width, ) + width=self.width, + ) elif self.is_input_patches: - assert ( - sample_size is not None - ), "Transformer2DModel over patched input must provide sample_size" + assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size" self.height = sample_size self.width = sample_size @@ -189,25 +180,29 @@ def __init__( width=sample_size, patch_size=patch_size, in_channels=in_channels, - embed_dim=inner_dim, ) + embed_dim=inner_dim, + ) # 3. 
Define transformers blocks - self.transformer_blocks = nn.LayerList([ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - cross_attention_dim=cross_attention_dim, - activation_fn=activation_fn, - num_embeds_ada_norm=num_embeds_ada_norm, - attention_bias=attention_bias, - only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, - norm_type=norm_type, - norm_elementwise_affine=norm_elementwise_affine, ) - for d in range(num_layers) - ]) + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + norm_elementwise_affine=norm_elementwise_affine, + ) + for d in range(num_layers) + ] + ) # 4. Define output layers self.out_channels = in_channels if out_channels is None else out_channels @@ -216,8 +211,7 @@ def __init__( if use_linear_projection: self.proj_out = nn.Linear(inner_dim, in_channels) else: - self.proj_out = nn.Conv2D( - inner_dim, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = nn.Conv2D(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) elif self.is_input_vectorized: self.norm_out = nn.LayerNorm(inner_dim) self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1) @@ -226,17 +220,17 @@ def __init__( norm_kwargs = {"weight_attr": False, "bias_attr": False} self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs) self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim) - self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * - self.out_channels) + self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels) def forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + class_labels=None, + cross_attention_kwargs=None, + return_dict: bool = True, + ): """ Args: hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. @@ -270,8 +264,7 @@ def forward( if self.use_linear_projection: hidden_states = self.proj_in(hidden_states) elif self.is_input_vectorized: - hidden_states = self.latent_image_embedding( - hidden_states.cast("int64")) + hidden_states = self.latent_image_embedding(hidden_states.cast("int64")) elif self.is_input_patches: hidden_states = self.pos_embed(hidden_states) @@ -282,14 +275,14 @@ def forward( encoder_hidden_states=encoder_hidden_states, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, - class_labels=class_labels, ) + class_labels=class_labels, + ) # 3. 
Output if self.is_input_continuous: if self.use_linear_projection: hidden_states = self.proj_out(hidden_states) - hidden_states = hidden_states.reshape( - [-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2]) + hidden_states = hidden_states.reshape([-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2]) if not self.use_linear_projection: hidden_states = self.proj_out(hidden_states) output = hidden_states + residual @@ -300,31 +293,32 @@ def forward( logits = logits.transpose([0, 2, 1]) # log(p(x_0)) - output = F.log_softmax( - logits.cast("float64"), axis=1).cast("float32") + output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32") elif self.is_input_patches: # TODO: cleanup! conditioning = self.transformer_blocks[0].norm1.emb( - timestep, class_labels, hidden_dtype=hidden_states.dtype) - shift, scale = self.proj_out_1(F.silu(conditioning)).chunk( - 2, axis=1) - hidden_states = (self.norm_out(hidden_states) * - (1 + scale[:, None]) + shift[:, None]) + timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1) + hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None] hidden_states = self.proj_out_2(hidden_states) # unpatchify - height = width = int(hidden_states.shape[1]**0.5) + height = width = int(hidden_states.shape[1] ** 0.5) hidden_states = hidden_states.reshape( - (-1, height, width, self.patch_size, self.patch_size, - self.out_channels)) + (-1, height, width, self.patch_size, self.patch_size, self.out_channels) + ) hidden_states = paddle.einsum("nhwpqc->nchpwq", hidden_states) - output = hidden_states.reshape(( - -1, - self.out_channels, - height * self.patch_size, - width * self.patch_size, )) + output = hidden_states.reshape( + ( + -1, + self.out_channels, + height * self.patch_size, + width * self.patch_size, + ) + ) if not return_dict: - return (output, ) + return (output,) return Transformer2DModelOutput(sample=output) diff --git a/ppdiffusers/ppdiffusers/models/transformer_temporal.py b/ppdiffusers/ppdiffusers/models/transformer_temporal.py index 0052335c043f4..bfd1985eb99a7 100644 --- a/ppdiffusers/ppdiffusers/models/transformer_temporal.py +++ b/ppdiffusers/ppdiffusers/models/transformer_temporal.py @@ -60,52 +60,56 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - num_attention_heads: int=16, - attention_head_dim: int=88, - in_channels: Optional[int]=None, - out_channels: Optional[int]=None, - num_layers: int=1, - dropout: float=0.0, - norm_num_groups: int=32, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - sample_size: Optional[int]=None, - activation_fn: str="geglu", - norm_elementwise_affine: bool=True, - double_self_attention: bool=True, ): + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + activation_fn: str = "geglu", + norm_elementwise_affine: bool = True, + double_self_attention: bool = True, + ): super().__init__() self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim inner_dim = num_attention_heads * attention_head_dim self.in_channels = in_channels - self.norm = nn.GroupNorm( - num_groups=norm_num_groups, num_channels=in_channels, 
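The unpatchify step in the patched-output branch of Transformer2DModel.forward above is dense; here are the same reshapes as a standalone sketch (hypothetical helper name, identical einsum pattern):

```python
import paddle

def unpatchify(hidden_states: paddle.Tensor, patch_size: int, out_channels: int) -> paddle.Tensor:
    # hidden_states: (batch, num_patches, patch_size * patch_size * out_channels)
    height = width = int(hidden_states.shape[1] ** 0.5)
    hidden_states = hidden_states.reshape((-1, height, width, patch_size, patch_size, out_channels))
    # Move channels to the front and interleave the patch pixels back into the spatial grid.
    hidden_states = paddle.einsum("nhwpqc->nchpwq", hidden_states)
    return hidden_states.reshape((-1, out_channels, height * patch_size, width * patch_size))
```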
epsilon=1e-06) + self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06) self.proj_in = nn.Linear(in_channels, inner_dim) - self.transformer_blocks = nn.LayerList([ - BasicTransformerBlock( - inner_dim, - num_attention_heads, - attention_head_dim, - dropout=dropout, - cross_attention_dim=cross_attention_dim, - activation_fn=activation_fn, - attention_bias=attention_bias, - double_self_attention=double_self_attention, - norm_elementwise_affine=norm_elementwise_affine, ) - for d in range(num_layers) - ]) + self.transformer_blocks = nn.LayerList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + attention_bias=attention_bias, + double_self_attention=double_self_attention, + norm_elementwise_affine=norm_elementwise_affine, + ) + for d in range(num_layers) + ] + ) self.proj_out = nn.Linear(inner_dim, in_channels) def forward( - self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - num_frames=1, - cross_attention_kwargs=None, - return_dict: bool=True, ): + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + class_labels=None, + num_frames=1, + cross_attention_kwargs=None, + return_dict: bool = True, + ): """ Args: hidden_states ( When discrete, `paddle.Tensor` of shape `(batch size, num latent pixels)`. @@ -131,12 +135,12 @@ def forward( batch_frames, channel, height, width = hidden_states.shape batch_size = batch_frames // num_frames residual = hidden_states - hidden_states = hidden_states[None, :].reshape( - (batch_size, num_frames, channel, height, width)) + hidden_states = hidden_states[None, :].reshape((batch_size, num_frames, channel, height, width)) hidden_states = hidden_states.transpose([0, 2, 1, 3, 4]) hidden_states = self.norm(hidden_states) hidden_states = hidden_states.transpose([0, 3, 4, 2, 1]).reshape( - (batch_size * height * width, num_frames, channel)) + (batch_size * height * width, num_frames, channel) + ) hidden_states = self.proj_in(hidden_states) # 2. Blocks for block in self.transformer_blocks: @@ -145,15 +149,17 @@ def forward( encoder_hidden_states=encoder_hidden_states, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, - class_labels=class_labels, ) + class_labels=class_labels, + ) # 3. 
Output hidden_states = self.proj_out(hidden_states) - hidden_states = (hidden_states[None, None, :].reshape( - (batch_size, height, width, channel, num_frames)) - .transpose([0, 3, 4, 1, 2])) - hidden_states = hidden_states.reshape( - (batch_frames, channel, height, width)) + hidden_states = ( + hidden_states[None, None, :] + .reshape((batch_size, height, width, channel, num_frames)) + .transpose([0, 3, 4, 1, 2]) + ) + hidden_states = hidden_states.reshape((batch_frames, channel, height, width)) output = hidden_states + residual if not return_dict: - return (output, ) + return (output,) return TransformerTemporalModelOutput(sample=output) diff --git a/ppdiffusers/ppdiffusers/models/unet_1d.py b/ppdiffusers/ppdiffusers/models/unet_1d.py index 70ecea668c88f..df62f8477b0bb 100644 --- a/ppdiffusers/ppdiffusers/models/unet_1d.py +++ b/ppdiffusers/ppdiffusers/models/unet_1d.py @@ -23,8 +23,7 @@ from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin -from .unet_1d_blocks import (get_down_block, get_mid_block, get_out_block, - get_up_block) +from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block @dataclass @@ -73,29 +72,30 @@ class UNet1DModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - sample_size: int=65536, - sample_rate: Optional[int]=None, - in_channels: int=2, - out_channels: int=2, - extra_in_channels: int=0, - time_embedding_type: str="fourier", - flip_sin_to_cos: bool=True, - use_timestep_embedding: bool=False, - freq_shift: float=0.0, - down_block_types: Tuple[str]=( - "DownBlock1DNoSkip", - "DownBlock1D", - "AttnDownBlock1D", ), - up_block_types: Tuple[str]=("AttnUpBlock1D", "UpBlock1D", - "UpBlock1DNoSkip"), - mid_block_type: Tuple[str]="UNetMidBlock1D", - out_block_type: str=None, - block_out_channels: Tuple[int]=(32, 32, 64), - act_fn: str=None, - norm_num_groups: int=8, - layers_per_block: int=1, - downsample_each_block: bool=False, ): + self, + sample_size: int = 65536, + sample_rate: Optional[int] = None, + in_channels: int = 2, + out_channels: int = 2, + extra_in_channels: int = 0, + time_embedding_type: str = "fourier", + flip_sin_to_cos: bool = True, + use_timestep_embedding: bool = False, + freq_shift: float = 0.0, + down_block_types: Tuple[str] = ( + "DownBlock1DNoSkip", + "DownBlock1D", + "AttnDownBlock1D", + ), + up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), + mid_block_type: Tuple[str] = "UNetMidBlock1D", + out_block_type: str = None, + block_out_channels: Tuple[int] = (32, 32, 64), + act_fn: str = None, + norm_num_groups: int = 8, + layers_per_block: int = 1, + downsample_each_block: bool = False, + ): super().__init__() self.sample_size = sample_size @@ -105,13 +105,15 @@ def __init__( embedding_size=8, set_W_to_weight=False, log=False, - flip_sin_to_cos=flip_sin_to_cos, ) + flip_sin_to_cos=flip_sin_to_cos, + ) timestep_input_dim = 2 * block_out_channels[0] elif time_embedding_type == "positional": self.time_proj = Timesteps( block_out_channels[0], flip_sin_to_cos=flip_sin_to_cos, - downscale_freq_shift=freq_shift, ) + downscale_freq_shift=freq_shift, + ) timestep_input_dim = block_out_channels[0] if use_timestep_embedding: @@ -120,7 +122,8 @@ def __init__( in_channels=timestep_input_dim, time_embed_dim=time_embed_dim, act_fn=act_fn, - out_dim=block_out_channels[0], ) + out_dim=block_out_channels[0], + ) self.down_blocks = nn.LayerList([]) self.mid_block = None @@ -144,7 +147,8 @@ def 
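The reshape chain at the top of TransformerTemporalModel.forward above converts per-frame feature maps into per-pixel frame sequences; a sketch of just that conversion (the GroupNorm is omitted and the helper name is illustrative):

```python
import paddle

def frames_to_tokens(hidden_states: paddle.Tensor, num_frames: int) -> paddle.Tensor:
    # (batch * frames, C, H, W) -> (batch * H * W, frames, C)
    batch_frames, channel, height, width = hidden_states.shape
    batch_size = batch_frames // num_frames
    hidden_states = hidden_states.reshape((batch_size, num_frames, channel, height, width))
    hidden_states = hidden_states.transpose([0, 2, 1, 3, 4])  # (B, C, F, H, W); the norm runs in this layout
    hidden_states = hidden_states.transpose([0, 3, 4, 2, 1])  # (B, H, W, F, C)
    return hidden_states.reshape((batch_size * height * width, num_frames, channel))
```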
__init__( in_channels=input_channel, out_channels=output_channel, temb_channels=block_out_channels[0], - add_downsample=not is_final_block or downsample_each_block, ) + add_downsample=not is_final_block or downsample_each_block, + ) self.down_blocks.append(down_block) # mid @@ -155,7 +159,8 @@ def __init__( out_channels=block_out_channels[-1], embed_dim=block_out_channels[0], num_layers=layers_per_block, - add_downsample=downsample_each_block, ) + add_downsample=downsample_each_block, + ) # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -167,9 +172,9 @@ def __init__( for i, up_block_type in enumerate(up_block_types): prev_output_channel = output_channel - output_channel = (reversed_block_out_channels[i + 1] - if i < len(up_block_types) - 1 else - final_upsample_channels) + output_channel = ( + reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else final_upsample_channels + ) is_final_block = i == len(block_out_channels) - 1 @@ -179,26 +184,28 @@ def __init__( in_channels=prev_output_channel, out_channels=output_channel, temb_channels=block_out_channels[0], - add_upsample=not is_final_block, ) + add_upsample=not is_final_block, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - num_groups_out = (norm_num_groups if norm_num_groups is not None else - min(block_out_channels[0] // 4, 32)) + num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) self.out_block = get_out_block( out_block_type=out_block_type, num_groups_out=num_groups_out, embed_dim=block_out_channels[0], out_channels=out_channels, act_fn=act_fn, - fc_dim=block_out_channels[-1] // 4, ) + fc_dim=block_out_channels[-1] // 4, + ) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - return_dict: bool=True, ) -> Union[UNet1DOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + return_dict: bool = True, + ) -> Union[UNet1DOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): `(batch_size, num_channels, sample_size)` noisy inputs tensor @@ -223,16 +230,13 @@ def forward( timestep_embed = self.time_mlp(timestep_embed) else: timestep_embed = timestep_embed[..., None] - timestep_embed = timestep_embed.tile( - [1, 1, sample.shape[2]]).cast(sample.dtype) - timestep_embed = timestep_embed.broadcast_to( - (sample.shape[:1] + timestep_embed.shape[1:])) + timestep_embed = timestep_embed.tile([1, 1, sample.shape[2]]).cast(sample.dtype) + timestep_embed = timestep_embed.broadcast_to((sample.shape[:1] + timestep_embed.shape[1:])) # 2. down down_block_res_samples = () for downsample_block in self.down_blocks: - sample, res_samples = downsample_block( - hidden_states=sample, temb=timestep_embed) + sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed) down_block_res_samples += res_samples # 3. mid @@ -243,16 +247,13 @@ def forward( for i, upsample_block in enumerate(self.up_blocks): res_samples = down_block_res_samples[-1:] down_block_res_samples = down_block_res_samples[:-1] - sample = upsample_block( - sample, - res_hidden_states_tuple=res_samples, - temb=timestep_embed) + sample = upsample_block(sample, res_hidden_states_tuple=res_samples, temb=timestep_embed) # 5. 
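When UNet1DModel is configured without a timestep MLP, the forward pass above simply tiles the projected timestep features along the signal length; a minimal sketch of that broadcast (hypothetical helper name):

```python
import paddle

def broadcast_time_embed(timestep_embed: paddle.Tensor, sample: paddle.Tensor) -> paddle.Tensor:
    # (batch, C) timestep features -> (batch, C, L) so they line up with a 1D signal.
    timestep_embed = timestep_embed[..., None]  # (batch, C, 1)
    timestep_embed = timestep_embed.tile([1, 1, sample.shape[2]]).cast(sample.dtype)
    # Broadcast over the batch axis as well, in case a single timestep was passed.
    return timestep_embed.broadcast_to(sample.shape[:1] + timestep_embed.shape[1:])
```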
post-process if self.out_block: sample = self.out_block(sample, timestep_embed) if not return_dict: - return (sample, ) + return (sample,) return UNet1DOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py index 7b3cf833bfba8..41a1810408693 100644 --- a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py +++ b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py @@ -20,24 +20,24 @@ from paddle import nn from ..utils import is_ppxformers_available -from .resnet import (Downsample1D, ResidualTemporalBlock1D, Upsample1D, - rearrange_dims) +from .resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims class DownResnetBlock1D(nn.Layer): def __init__( - self, - in_channels, - out_channels=None, - num_layers=1, - conv_shortcut=False, - temb_channels=32, - groups=32, - groups_out=None, - non_linearity=None, - time_embedding_norm="default", - output_scale_factor=1.0, - add_downsample=True, ): + self, + in_channels, + out_channels=None, + num_layers=1, + conv_shortcut=False, + temb_channels=32, + groups=32, + groups_out=None, + non_linearity=None, + time_embedding_norm="default", + output_scale_factor=1.0, + add_downsample=True, + ): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels @@ -51,15 +51,10 @@ def __init__( groups_out = groups # there will always be at least one resnet - resnets = [ - ResidualTemporalBlock1D( - in_channels, out_channels, embed_dim=temb_channels) - ] + resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)] for _ in range(num_layers): - resnets.append( - ResidualTemporalBlock1D( - out_channels, out_channels, embed_dim=temb_channels)) + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) self.resnets = nn.LayerList(resnets) @@ -74,8 +69,7 @@ def __init__( self.downsample = None if add_downsample: - self.downsample = Downsample1D( - out_channels, use_conv=True, padding=1) + self.downsample = Downsample1D(out_channels, use_conv=True, padding=1) def forward(self, hidden_states, temb=None): output_states = () @@ -84,7 +78,7 @@ def forward(self, hidden_states, temb=None): for resnet in self.resnets[1:]: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.nonlinearity is not None: hidden_states = self.nonlinearity(hidden_states) @@ -97,17 +91,18 @@ def forward(self, hidden_states, temb=None): class UpResnetBlock1D(nn.Layer): def __init__( - self, - in_channels, - out_channels=None, - num_layers=1, - temb_channels=32, - groups=32, - groups_out=None, - non_linearity=None, - time_embedding_norm="default", - output_scale_factor=1.0, - add_upsample=True, ): + self, + in_channels, + out_channels=None, + num_layers=1, + temb_channels=32, + groups=32, + groups_out=None, + non_linearity=None, + time_embedding_norm="default", + output_scale_factor=1.0, + add_upsample=True, + ): super().__init__() self.in_channels = in_channels out_channels = in_channels if out_channels is None else out_channels @@ -120,15 +115,10 @@ def __init__( groups_out = groups # there will always be at least one resnet - resnets = [ - ResidualTemporalBlock1D( - 2 * in_channels, out_channels, embed_dim=temb_channels) - ] + resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)] for _ in range(num_layers): - resnets.append( - ResidualTemporalBlock1D( - out_channels, 
out_channels, embed_dim=temb_channels)) + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) self.resnets = nn.LayerList(resnets) @@ -148,8 +138,7 @@ def __init__( def forward(self, hidden_states, res_hidden_states_tuple=None, temb=None): if res_hidden_states_tuple is not None: res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - (hidden_states, res_hidden_states), axis=1) + hidden_states = paddle.concat((hidden_states, res_hidden_states), axis=1) hidden_states = self.resnets[0](hidden_states, temb) for resnet in self.resnets[1:]: @@ -171,11 +160,9 @@ def __init__(self, in_channels, out_channels, embed_dim): self.out_channels = out_channels self.embed_dim = embed_dim - self.res1 = ResidualTemporalBlock1D( - in_channels, in_channels // 2, embed_dim=embed_dim) + self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim) self.down1 = Downsample1D(out_channels // 2, use_conv=True) - self.res2 = ResidualTemporalBlock1D( - in_channels // 2, in_channels // 4, embed_dim=embed_dim) + self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim) self.down2 = Downsample1D(out_channels // 4, use_conv=True) def forward(self, x, temb=None): @@ -188,29 +175,25 @@ def forward(self, x, temb=None): class MidResTemporalBlock1D(nn.Layer): def __init__( - self, - in_channels, - out_channels, - embed_dim, - num_layers: int=1, - add_downsample: bool=False, - add_upsample: bool=False, - non_linearity=None, ): + self, + in_channels, + out_channels, + embed_dim, + num_layers: int = 1, + add_downsample: bool = False, + add_upsample: bool = False, + non_linearity=None, + ): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.add_downsample = add_downsample # there will always be at least one resnet - resnets = [ - ResidualTemporalBlock1D( - in_channels, out_channels, embed_dim=embed_dim) - ] + resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)] for _ in range(num_layers): - resnets.append( - ResidualTemporalBlock1D( - out_channels, out_channels, embed_dim=embed_dim)) + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim)) self.resnets = nn.LayerList(resnets) @@ -271,11 +254,13 @@ def forward(self, hidden_states, temb=None): class OutValueFunctionBlock(nn.Layer): def __init__(self, fc_dim, embed_dim): super().__init__() - self.final_block = nn.LayerList([ - nn.Linear(fc_dim + embed_dim, fc_dim // 2), - nn.Mish(), - nn.Linear(fc_dim // 2, 1), - ]) + self.final_block = nn.LayerList( + [ + nn.Linear(fc_dim + embed_dim, fc_dim // 2), + nn.Mish(), + nn.Linear(fc_dim // 2, 1), + ] + ) def forward(self, hidden_states, temb): hidden_states = hidden_states.reshape([hidden_states.shape[0], -1]) @@ -324,15 +309,11 @@ def __init__(self, kernel="linear", pad_mode="reflect"): self.register_buffer("kernel", kernel_1d) def forward(self, hidden_states): - hidden_states = F.pad(hidden_states, (self.pad, ) * 2, - self.pad_mode, - data_format="NCL") + hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode, data_format="NCL") weight = paddle.zeros( - [ - hidden_states.shape[1], hidden_states.shape[1], - self.kernel.shape[0] - ], - dtype=hidden_states.dtype, ) + [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], + dtype=hidden_states.dtype, + ) indices = paddle.arange(hidden_states.shape[1]) weight[indices, indices] = self.kernel.cast(weight.dtype) return 
F.conv1d(hidden_states, weight, stride=2) @@ -347,19 +328,14 @@ def __init__(self, kernel="linear", pad_mode="reflect"): self.register_buffer("kernel", kernel_1d) def forward(self, hidden_states, temb=None): - hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2, ) * 2, - self.pad_mode, - data_format="NCL") + hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode, data_format="NCL") weight = paddle.zeros( - [ - hidden_states.shape[1], hidden_states.shape[1], - self.kernel.shape[0] - ], - dtype=hidden_states.dtype, ) + [hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]], + dtype=hidden_states.dtype, + ) indices = paddle.arange(hidden_states.shape[1]) weight[indices, indices] = self.kernel.cast(weight.dtype) - return F.conv1d_transpose( - hidden_states, weight, stride=2, padding=self.pad * 2 + 1) + return F.conv1d_transpose(hidden_states, weight, stride=2, padding=self.pad * 2 + 1) class SelfAttention1d(nn.Layer): @@ -395,9 +371,10 @@ def reshape_batch_dim_to_heads(self, tensor, transpose=True): return tensor def set_use_memory_efficient_attention_xformers( - self, - use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 # if self.head_size > 128 and attention_op == "flash": # attention_op = "cutlass" @@ -409,18 +386,15 @@ def set_use_memory_efficient_attention_xformers( else: try: _ = F.scaled_dot_product_attention_( - paddle.randn( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.randn( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.randn( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.randn((1, 1, 2, 40), dtype=paddle.float16), + paddle.randn((1, 1, 2, 40), dtype=paddle.float16), + paddle.randn((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e - self._use_memory_efficient_attention_xformers = ( - use_memory_efficient_attention_xformers) + self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers self._attention_op = attention_op def forward(self, hidden_states): @@ -434,14 +408,14 @@ def forward(self, hidden_states): value_proj = self.value(hidden_states) query_proj = self.reshape_heads_to_batch_dim( - query_proj, - transpose=not self._use_memory_efficient_attention_xformers) + query_proj, transpose=not self._use_memory_efficient_attention_xformers + ) key_proj = self.reshape_heads_to_batch_dim( - key_proj, - transpose=not self._use_memory_efficient_attention_xformers) + key_proj, transpose=not self._use_memory_efficient_attention_xformers + ) value_proj = self.reshape_heads_to_batch_dim( - value_proj, - transpose=not self._use_memory_efficient_attention_xformers) + value_proj, transpose=not self._use_memory_efficient_attention_xformers + ) if self._use_memory_efficient_attention_xformers: hidden_states = F.scaled_dot_product_attention_( @@ -452,19 +426,17 @@ def forward(self, hidden_states): scale=self.scale, dropout_p=0.0, training=self.training, - attention_op=self._attention_op, ) + attention_op=self._attention_op, + ) else: - attention_scores = (paddle.matmul( - query_proj, key_proj, transpose_y=True) * self.scale) - attention_probs = F.softmax( - attention_scores.cast("float32"), - axis=-1).cast(attention_scores.dtype) + attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * self.scale + attention_probs = 
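Downsample1d above implements FIR-style anti-aliased downsampling by placing one shared 1D kernel on the diagonal of a conv weight; the same pattern as a free function (a sketch reusing the Paddle calls from the hunk, with a hypothetical name):

```python
import paddle
import paddle.nn.functional as F

def fir_downsample_1d(hidden_states: paddle.Tensor, kernel_1d: paddle.Tensor,
                      pad: int, pad_mode: str = "reflect") -> paddle.Tensor:
    hidden_states = F.pad(hidden_states, (pad,) * 2, pad_mode, data_format="NCL")
    channels = hidden_states.shape[1]
    # One smoothing kernel per channel: zeros everywhere except the diagonal.
    weight = paddle.zeros([channels, channels, kernel_1d.shape[0]], dtype=hidden_states.dtype)
    indices = paddle.arange(channels)
    weight[indices, indices] = kernel_1d.cast(weight.dtype)
    # The stride-2 convolution performs the actual decimation.
    return F.conv1d(hidden_states, weight, stride=2)
```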
F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype) hidden_states = paddle.matmul(attention_probs, value_proj) # reshape hidden_states hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, - transpose=not self._use_memory_efficient_attention_xformers) + hidden_states, transpose=not self._use_memory_efficient_attention_xformers + ) # compute next hidden_states hidden_states = self.proj_attn(hidden_states) @@ -483,8 +455,7 @@ def __init__(self, in_channels, mid_channels, out_channels, is_last=False): self.has_conv_skip = in_channels != out_channels if self.has_conv_skip: - self.conv_skip = nn.Conv1D( - in_channels, out_channels, 1, bias_attr=False) + self.conv_skip = nn.Conv1D(in_channels, out_channels, 1, bias_attr=False) self.conv_1 = nn.Conv1D(in_channels, mid_channels, 5, padding=2) self.group_norm_1 = nn.GroupNorm(1, mid_channels) @@ -496,8 +467,7 @@ def __init__(self, in_channels, mid_channels, out_channels, is_last=False): self.gelu_2 = nn.GELU() def forward(self, hidden_states): - residual = (self.conv_skip(hidden_states) - if self.has_conv_skip else hidden_states) + residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states hidden_states = self.conv_1(hidden_states) hidden_states = self.group_norm_1(hidden_states) @@ -579,7 +549,7 @@ def forward(self, hidden_states, temb=None): hidden_states = resnet(hidden_states) hidden_states = attn(hidden_states) - return hidden_states, (hidden_states, ) + return hidden_states, (hidden_states,) class DownBlock1D(nn.Layer): @@ -602,7 +572,7 @@ def forward(self, hidden_states, temb=None): for resnet in self.resnets: hidden_states = resnet(hidden_states) - return hidden_states, (hidden_states, ) + return hidden_states, (hidden_states,) class DownBlock1DNoSkip(nn.Layer): @@ -623,7 +593,7 @@ def forward(self, hidden_states, temb=None): for resnet in self.resnets: hidden_states = resnet(hidden_states) - return hidden_states, (hidden_states, ) + return hidden_states, (hidden_states,) class AttnUpBlock1D(nn.Layer): @@ -648,8 +618,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None): def forward(self, hidden_states, res_hidden_states_tuple, temb=None): res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states) @@ -676,8 +645,7 @@ def __init__(self, in_channels, out_channels, mid_channels=None): def forward(self, hidden_states, res_hidden_states_tuple, temb=None): res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) for resnet in self.resnets: hidden_states = resnet(hidden_states) @@ -695,16 +663,14 @@ def __init__(self, in_channels, out_channels, mid_channels=None): resnets = [ ResConvBlock(2 * in_channels, mid_channels, mid_channels), ResConvBlock(mid_channels, mid_channels, mid_channels), - ResConvBlock( - mid_channels, mid_channels, out_channels, is_last=True), + ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True), ] self.resnets = nn.LayerList(resnets) def forward(self, hidden_states, res_hidden_states_tuple, temb=None): res_hidden_states = res_hidden_states_tuple[-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = 
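The else-branch of SelfAttention1d.forward above is the plain matmul attention fallback used when the memory-efficient kernel is disabled; isolated, it amounts to the following (hypothetical function name, same softmax-in-fp32 trick):

```python
import paddle
import paddle.nn.functional as F

def naive_attention(query_proj: paddle.Tensor, key_proj: paddle.Tensor,
                    value_proj: paddle.Tensor, scale: float) -> paddle.Tensor:
    attention_scores = paddle.matmul(query_proj, key_proj, transpose_y=True) * scale
    # Softmax in float32 for numerical stability, then cast back to the score dtype.
    attention_probs = F.softmax(attention_scores.cast("float32"), axis=-1).cast(attention_scores.dtype)
    return paddle.matmul(attention_probs, value_proj)
```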
paddle.concat([hidden_states, res_hidden_states], axis=1) for resnet in self.resnets: hidden_states = resnet(hidden_states) @@ -713,79 +679,77 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, ): + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, +): if down_block_type == "DownResnetBlock1D": return DownResnetBlock1D( in_channels=in_channels, num_layers=num_layers, out_channels=out_channels, temb_channels=temb_channels, - add_downsample=add_downsample, ) + add_downsample=add_downsample, + ) elif down_block_type == "DownBlock1D": return DownBlock1D(out_channels=out_channels, in_channels=in_channels) elif down_block_type == "AttnDownBlock1D": - return AttnDownBlock1D( - out_channels=out_channels, in_channels=in_channels) + return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels) elif down_block_type == "DownBlock1DNoSkip": - return DownBlock1DNoSkip( - out_channels=out_channels, in_channels=in_channels) + return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels) raise ValueError(f"{down_block_type} does not exist.") -def get_up_block(up_block_type, num_layers, in_channels, out_channels, - temb_channels, add_upsample): +def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_channels, add_upsample): if up_block_type == "UpResnetBlock1D": return UpResnetBlock1D( in_channels=in_channels, num_layers=num_layers, out_channels=out_channels, temb_channels=temb_channels, - add_upsample=add_upsample, ) + add_upsample=add_upsample, + ) elif up_block_type == "UpBlock1D": return UpBlock1D(in_channels=in_channels, out_channels=out_channels) elif up_block_type == "AttnUpBlock1D": return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels) elif up_block_type == "UpBlock1DNoSkip": - return UpBlock1DNoSkip( - in_channels=in_channels, out_channels=out_channels) + return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels) raise ValueError(f"{up_block_type} does not exist.") def get_mid_block( - mid_block_type, - num_layers, - in_channels, - mid_channels, - out_channels, - embed_dim, - add_downsample, ): + mid_block_type, + num_layers, + in_channels, + mid_channels, + out_channels, + embed_dim, + add_downsample, +): if mid_block_type == "MidResTemporalBlock1D": return MidResTemporalBlock1D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim, - add_downsample=add_downsample, ) + add_downsample=add_downsample, + ) elif mid_block_type == "ValueFunctionMidBlock1D": - return ValueFunctionMidBlock1D( - in_channels=in_channels, - out_channels=out_channels, - embed_dim=embed_dim) + return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim) elif mid_block_type == "UNetMidBlock1D": return UNetMidBlock1D( in_channels=in_channels, mid_channels=mid_channels, - out_channels=out_channels, ) + out_channels=out_channels, + ) raise ValueError(f"{mid_block_type} does not exist.") -def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, - act_fn, fc_dim): +def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, act_fn, fc_dim): if out_block_type == "OutConv1DBlock": return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn) elif out_block_type == "ValueFunction": diff --git a/ppdiffusers/ppdiffusers/models/unet_2d.py 
b/ppdiffusers/ppdiffusers/models/unet_2d.py index c3bcf99332789..f66b21a6a9e50 100644 --- a/ppdiffusers/ppdiffusers/models/unet_2d.py +++ b/ppdiffusers/ppdiffusers/models/unet_2d.py @@ -83,37 +83,40 @@ class conditioning with `class_embed_type` equal to `None`. @register_to_config def __init__( - self, - sample_size: Optional[Union[int, Tuple[int, int]]]=None, - in_channels: int=3, - out_channels: int=3, - center_input_sample: bool=False, - time_embedding_type: str="positional", - freq_shift: int=0, - flip_sin_to_cos: bool=True, - down_block_types: Tuple[str]=( - "DownBlock2D", - "AttnDownBlock2D", - "AttnDownBlock2D", - "AttnDownBlock2D", ), - up_block_types: Tuple[str]=( - "AttnUpBlock2D", - "AttnUpBlock2D", - "AttnUpBlock2D", - "UpBlock2D", ), - block_out_channels: Tuple[int]=(224, 448, 672, 896), - layers_per_block: int=2, - mid_block_scale_factor: float=1, - downsample_padding: int=1, - act_fn: str="silu", - attention_head_dim: Optional[int]=8, - norm_num_groups: int=32, - norm_eps: float=1e-5, - resnet_time_scale_shift: str="default", - add_attention: bool=True, - class_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - resnet_pre_temb_non_linearity: Optional[bool]=False, ): + self, + sample_size: Optional[Union[int, Tuple[int, int]]] = None, + in_channels: int = 3, + out_channels: int = 3, + center_input_sample: bool = False, + time_embedding_type: str = "positional", + freq_shift: int = 0, + flip_sin_to_cos: bool = True, + down_block_types: Tuple[str] = ( + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ), + up_block_types: Tuple[str] = ( + "AttnUpBlock2D", + "AttnUpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + ), + block_out_channels: Tuple[int] = (224, 448, 672, 896), + layers_per_block: int = 2, + mid_block_scale_factor: float = 1, + downsample_padding: int = 1, + act_fn: str = "silu", + attention_head_dim: Optional[int] = 8, + norm_num_groups: int = 32, + norm_eps: float = 1e-5, + resnet_time_scale_shift: str = "default", + add_attention: bool = True, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + resnet_pre_temb_non_linearity: Optional[bool] = False, + ): super().__init__() self.sample_size = sample_size @@ -131,29 +134,23 @@ def __init__( ) # input - self.conv_in = nn.Conv2D( - in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) + self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) # time if time_embedding_type == "fourier": - self.time_proj = GaussianFourierProjection( - embedding_size=block_out_channels[0], scale=16) + self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16) timestep_input_dim = 2 * block_out_channels[0] elif time_embedding_type == "positional": - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] - self.time_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.class_embedding = 
TimestepEmbedding(timestep_input_dim, time_embed_dim) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) else: @@ -195,7 +192,8 @@ def __init__( attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) # mid @@ -209,7 +207,8 @@ def __init__( attn_num_head_channels=attention_head_dim, resnet_groups=norm_num_groups, add_attention=add_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -217,8 +216,7 @@ def __init__( for i, up_block_type in enumerate(up_block_types): prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] is_final_block = i == len(block_out_channels) - 1 @@ -235,27 +233,28 @@ def __init__( resnet_groups=norm_num_groups, attn_num_head_channels=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - num_groups_out = (norm_num_groups if norm_num_groups is not None else - min(block_out_channels[0] // 4, 32)) + num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=num_groups_out, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) self.conv_act = nn.Silu() - self.conv_out = nn.Conv2D( - block_out_channels[0], out_channels, kernel_size=3, padding=1) + self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, kernel_size=3, padding=1) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - class_labels: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet2DOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + class_labels: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor @@ -284,7 +283,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) @@ -296,9 +299,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when doing class conditioning" - ) + raise ValueError("class_labels should be provided when doing class conditioning") class_labels = class_labels.cast(self.dtype) @@ -315,7 +316,7 @@ def forward( sample = self.conv_in(sample) # 3. 
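UNet2DModel.forward above first normalizes the timestep into a batch-sized tensor before projecting and embedding it; roughly as follows (a sketch of the visible steps only, assuming `unet` exposes `time_proj` and `time_embedding` as in the diff; the helper name is illustrative):

```python
import paddle

def prepare_time_embedding(unet, sample: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor:
    if len(timesteps.shape) == 0:
        timesteps = timesteps[None]
    # Broadcast to the batch dimension in a way that stays ONNX/Core ML compatible.
    timesteps = timesteps.expand([sample.shape[0]])
    t_emb = unet.time_proj(timesteps)   # sinusoidal or Fourier features
    return unet.time_embedding(t_emb)   # small MLP producing the per-block embedding
```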
down - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) if self.resnet_pre_temb_non_linearity: emb = self.down_resnet_temb_nonlinearity(emb) @@ -323,10 +324,10 @@ def forward( for downsample_block in self.down_blocks: if hasattr(downsample_block, "skip_conv"): sample, res_samples, skip_sample = downsample_block( - hidden_states=sample, temb=emb, skip_sample=skip_sample) + hidden_states=sample, temb=emb, skip_sample=skip_sample + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) down_block_res_samples += res_samples @@ -336,13 +337,11 @@ def forward( # 5. up skip_sample = None for upsample_block in self.up_blocks: - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] if hasattr(upsample_block, "skip_conv"): - sample, skip_sample = upsample_block(sample, res_samples, emb, - skip_sample) + sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample) else: sample = upsample_block(sample, res_samples, emb) @@ -355,11 +354,10 @@ def forward( sample += skip_sample if self.config.time_embedding_type == "fourier": - timesteps = timesteps.reshape( - [sample.shape[0], *([1] * len(sample.shape[1:]))]) + timesteps = timesteps.reshape([sample.shape[0], *([1] * len(sample.shape[1:]))]) sample = sample / timesteps if not return_dict: - return (sample, ) + return (sample,) return UNet2DOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py index b49e5263c2077..5bfa7a33dcbff 100644 --- a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py +++ b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py @@ -22,36 +22,42 @@ from .attention import AdaGroupNorm, AttentionBlock from .attention_processor import Attention, AttnAddedKVProcessor from .dual_transformer_2d import DualTransformer2DModel -from .resnet import (Downsample2D, FirDownsample2D, FirUpsample2D, - KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D) +from .resnet import ( + Downsample2D, + FirDownsample2D, + FirUpsample2D, + KDownsample2D, + KUpsample2D, + ResnetBlock2D, + Upsample2D, +) from .transformer_2d import Transformer2DModel def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, - resnet_out_scale_factor=1.0, - cross_attention_norm=None, - resnet_pre_temb_non_linearity=False, ): - down_block_type = (down_block_type[7:] - if down_block_type.startswith("UNetRes") else - down_block_type) + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + 
resnet_out_scale_factor=1.0, + cross_attention_norm=None, + resnet_pre_temb_non_linearity=False, +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type if down_block_type == "DownBlock2D": return DownBlock2D( num_layers=num_layers, @@ -64,7 +70,8 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "ResnetDownsampleBlock2D": return ResnetDownsampleBlock2D( num_layers=num_layers, @@ -78,7 +85,8 @@ def get_down_block( resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, output_scale_factor=resnet_out_scale_factor, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "AttnDownBlock2D": return AttnDownBlock2D( num_layers=num_layers, @@ -92,11 +100,11 @@ def get_down_block( downsample_padding=downsample_padding, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "CrossAttnDownBlock2D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnDownBlock2D") + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") return CrossAttnDownBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -114,12 +122,11 @@ def get_down_block( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "SimpleCrossAttnDownBlock2D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D" - ) + raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D") return SimpleCrossAttnDownBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -136,7 +143,8 @@ def get_down_block( output_scale_factor=resnet_out_scale_factor, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "SkipDownBlock2D": return SkipDownBlock2D( num_layers=num_layers, @@ -148,7 +156,8 @@ def get_down_block( resnet_act_fn=resnet_act_fn, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "AttnSkipDownBlock2D": return AttnSkipDownBlock2D( num_layers=num_layers, @@ -161,7 +170,8 @@ def get_down_block( downsample_padding=downsample_padding, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "DownEncoderBlock2D": return DownEncoderBlock2D( num_layers=num_layers, @@ -173,7 
+183,8 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "AttnDownEncoderBlock2D": return AttnDownEncoderBlock2D( num_layers=num_layers, @@ -186,7 +197,8 @@ def get_down_block( downsample_padding=downsample_padding, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "KDownBlock2D": return KDownBlock2D( num_layers=num_layers, @@ -196,7 +208,8 @@ def get_down_block( add_downsample=add_downsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "KCrossAttnDownBlock2D": return KCrossAttnDownBlock2D( num_layers=num_layers, @@ -209,34 +222,35 @@ def get_down_block( cross_attention_dim=cross_attention_dim, attn_num_head_channels=attn_num_head_channels, add_self_attention=True if not add_downsample else False, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{down_block_type} does not exist.") def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, - resnet_out_scale_factor=1.0, - cross_attention_norm=None, - resnet_pre_temb_non_linearity=False, ): - up_block_type = (up_block_type[7:] - if up_block_type.startswith("UNetRes") else up_block_type) + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + resnet_out_scale_factor=1.0, + cross_attention_norm=None, + resnet_pre_temb_non_linearity=False, +): + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type if up_block_type == "UpBlock2D": return UpBlock2D( num_layers=num_layers, @@ -249,7 +263,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "ResnetUpsampleBlock2D": return ResnetUpsampleBlock2D( num_layers=num_layers, @@ -264,11 +279,11 @@ def get_up_block( resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, output_scale_factor=resnet_out_scale_factor, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "CrossAttnUpBlock2D": if 
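Since the factory signatures above are long, a hypothetical usage example may help; the argument values below are illustrative only and not taken from any shipped UNet config:

```python
# Build one cross-attention down block through the factory reformatted above.
down_block = get_down_block(
    "CrossAttnDownBlock2D",
    num_layers=2,
    in_channels=320,
    out_channels=640,
    temb_channels=1280,
    add_downsample=True,
    resnet_eps=1e-5,
    resnet_act_fn="silu",
    attn_num_head_channels=8,
    cross_attention_dim=768,
)
```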
cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnUpBlock2D") + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") return CrossAttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -286,12 +301,11 @@ def get_up_block( only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "SimpleCrossAttnUpBlock2D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D" - ) + raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D") return SimpleCrossAttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, @@ -309,7 +323,8 @@ def get_up_block( output_scale_factor=resnet_out_scale_factor, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "AttnUpBlock2D": return AttnUpBlock2D( num_layers=num_layers, @@ -323,7 +338,8 @@ def get_up_block( resnet_groups=resnet_groups, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "SkipUpBlock2D": return SkipUpBlock2D( num_layers=num_layers, @@ -335,7 +351,8 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "AttnSkipUpBlock2D": return AttnSkipUpBlock2D( num_layers=num_layers, @@ -348,7 +365,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "UpDecoderBlock2D": return UpDecoderBlock2D( num_layers=num_layers, @@ -359,7 +377,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "AttnUpDecoderBlock2D": return AttnUpDecoderBlock2D( num_layers=num_layers, @@ -371,7 +390,8 @@ def get_up_block( resnet_groups=resnet_groups, attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "KUpBlock2D": return KUpBlock2D( num_layers=num_layers, @@ -381,7 +401,8 @@ def get_up_block( add_upsample=add_upsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "KCrossAttnUpBlock2D": return KCrossAttnUpBlock2D( num_layers=num_layers, @@ -393,30 +414,31 @@ def 
get_up_block( resnet_act_fn=resnet_act_fn, cross_attention_dim=cross_attention_dim, attn_num_head_channels=attn_num_head_channels, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{up_block_type} does not exist.") class UNetMidBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - add_attention: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + add_attention: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.add_attention = add_attention # there is always at least one resnet @@ -432,7 +454,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -444,7 +467,9 @@ def __init__( num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) else: attentions.append(None) @@ -460,7 +485,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) @@ -477,29 +504,29 @@ def forward(self, hidden_states, temb=None): class UNetMidBlock2DCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups 
= (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet resnets = [ @@ -514,7 +541,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -529,7 +557,9 @@ def __init__( cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -538,7 +568,9 @@ def __init__( in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) resnets.append( ResnetBlock2D( in_channels=in_channels, @@ -551,24 +583,28 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = resnet(hidden_states, temb) return hidden_states @@ -576,30 +612,30 @@ def forward( class UNetMidBlock2DSimpleCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.num_heads = in_channels // self.attn_num_head_channels @@ -617,7 +653,8 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -639,7 +676,9 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) resnets.append( ResnetBlock2D( in_channels=in_channels, @@ -653,20 +692,22 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): # attn @@ -674,7 +715,8 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) # resnet hidden_states = resnet(hidden_states, temb) @@ -684,22 +726,23 @@ def forward( class AttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - downsample_padding: int=1, - add_downsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -718,27 +761,34 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -748,40 +798,41 @@ def 
forward(self, hidden_states, temb=None): for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb) hidden_states = attn(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class CrossAttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - downsample_padding: int=1, - add_downsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -803,7 +854,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -815,7 +868,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -824,99 +879,103 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, - additional_residuals=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + additional_residuals=None, + ): # TODO(Patrick, William) - attention mask is not used output_states = () for resnet, 
attn in zip(self.resnets, self.attentions): - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: - return module( - *inputs, return_dict=return_dict)[ - 0] # move [0] when paddlepaddle <= 2.4.1 + return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1 else: return module(*inputs) return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample - output_states += (hidden_states, ) + output_states += (hidden_states,) if additional_residuals is not None: hidden_states += additional_residuals # westfish: add to align with torch features - output_states = tuple(output_states[:-1]) + (hidden_states, ) + output_states = tuple(output_states[:-1]) + (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class DownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -934,19 +993,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -956,8 +1020,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not 
hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -965,38 +1028,38 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class DownEncoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1014,19 +1077,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -1043,21 +1111,22 @@ def forward(self, hidden_states): class AttnDownEncoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1076,27 +1145,34 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -1114,21 +1190,22 @@ def forward(self, hidden_states): class AttnSkipDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=np.sqrt(2.0), - downsample_padding: int=1, - add_downsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = np.sqrt(2.0), + downsample_padding: int = 1, + add_downsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.attentions = nn.LayerList([]) self.resnets = nn.LayerList([]) @@ -1148,13 +1225,17 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, - eps=resnet_eps, )) + eps=resnet_eps, + ) + ) if add_downsample: self.resnet_down = ResnetBlock2D( @@ -1171,12 +1252,10 @@ def __init__( use_in_shortcut=True, down=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.downsamplers = nn.LayerList( - [FirDownsample2D( - out_channels, out_channels=out_channels)]) - self.skip_conv = nn.Conv2D( - 3, out_channels, kernel_size=(1, 1), stride=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)]) + self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) else: self.resnet_down = None self.downsamplers = None @@ -1188,7 +1267,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None): for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb) hidden_states = attn(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: hidden_states = self.resnet_down(hidden_states, temb) @@ -1197,27 +1276,28 @@ def forward(self, hidden_states, temb=None, skip_sample=None): hidden_states = self.skip_conv(skip_sample) + hidden_states - output_states += (hidden_states, ) + output_states += (hidden_states,) return 
hidden_states, output_states, skip_sample class SkipDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - output_scale_factor: float=np.sqrt(2.0), - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + output_scale_factor: float = np.sqrt(2.0), + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.resnets = nn.LayerList([]) @@ -1236,7 +1316,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if add_downsample: self.resnet_down = ResnetBlock2D( @@ -1253,12 +1335,10 @@ def __init__( use_in_shortcut=True, down=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.downsamplers = nn.LayerList( - [FirDownsample2D( - out_channels, out_channels=out_channels)]) - self.skip_conv = nn.Conv2D( - 3, out_channels, kernel_size=(1, 1), stride=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.downsamplers = nn.LayerList([FirDownsample2D(out_channels, out_channels=out_channels)]) + self.skip_conv = nn.Conv2D(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) else: self.resnet_down = None self.downsamplers = None @@ -1269,7 +1349,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None): for resnet in self.resnets: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: hidden_states = self.resnet_down(hidden_states, temb) @@ -1278,28 +1358,29 @@ def forward(self, hidden_states, temb=None, skip_sample=None): hidden_states = self.skip_conv(skip_sample) + hidden_states - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states, skip_sample class ResnetDownsampleBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - skip_time_act: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + skip_time_act: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1318,27 +1399,32 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - down=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.downsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + down=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.downsamplers = None @@ -1348,8 +1434,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1357,43 +1442,43 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class SimpleCrossAttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_downsample: bool=True, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True @@ -1419,7 +1504,9 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) # TODO use 
AttnAddedKVProcessor2_5 # processor = ( # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() @@ -1437,42 +1524,47 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - down=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.downsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + down=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): output_states = () - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} for resnet, attn in zip(self.resnets, self.attentions): # resnet @@ -1483,32 +1575,34 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class KDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=4, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_group_size: int=32, - add_downsample: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: int = 32, + add_downsample: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1529,7 +1623,9 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) @@ -1545,8 +1641,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and 
self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1554,12 +1649,11 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: @@ -1570,20 +1664,21 @@ def custom_forward(*inputs): class KCrossAttnDownBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - cross_attention_dim: int, - dropout: float=0.0, - num_layers: int=4, - resnet_group_size: int=32, - add_downsample=True, - attn_num_head_channels: int=64, - add_self_attention: bool=False, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + cross_attention_dim: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_group_size: int = 32, + add_downsample=True, + attn_num_head_channels: int = 64, + add_self_attention: bool = False, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1607,7 +1702,9 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( KAttentionBlock( out_channels, @@ -1618,7 +1715,9 @@ def __init__( attention_bias=True, add_self_attention=add_self_attention, cross_attention_norm="layer_norm", - group_size=resnet_group_size, )) + group_size=resnet_group_size, + ) + ) self.resnets = nn.LayerList(resnets) self.attentions = nn.LayerList(attentions) @@ -1631,17 +1730,17 @@ def __init__( self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): output_states = () for resnet, attn in zip(self.resnets, self.attentions): - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): @@ -1652,15 +1751,14 @@ def custom_forward(*inputs): return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, attention_mask, - cross_attention_kwargs, ) + cross_attention_kwargs, + ) else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( @@ -1668,12 +1766,13 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, emb=temb, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, 
) + cross_attention_kwargs=cross_attention_kwargs, + ) if self.downsamplers is None: - output_states += (None, ) + output_states += (None,) else: - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: @@ -1684,29 +1783,29 @@ def custom_forward(*inputs): class AttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1721,23 +1820,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None @@ -1746,8 +1846,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) hidden_states = attn(hidden_states) @@ -1761,27 +1860,28 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): class CrossAttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_upsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: 
bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1790,8 +1890,7 @@ def __init__( self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1806,7 +1905,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -1818,7 +1919,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1827,64 +1930,61 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + cross_attention_kwargs=None, + upsample_size=None, + attention_mask=None, + ): # TODO(Patrick, William) - attention mask is not used for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: - return module( - *inputs, return_dict=return_dict)[0] # move [0] + return module(*inputs, return_dict=return_dict)[0] # move [0] else: return module(*inputs) return 
custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -1895,27 +1995,27 @@ def custom_forward(*inputs): class UpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1930,34 +2030,27 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1965,8 +2058,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = 
resnet(hidden_states, temb) @@ -1979,19 +2071,20 @@ def custom_forward(*inputs): class UpDecoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -2010,15 +2103,14 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None @@ -2035,20 +2127,21 @@ def forward(self, hidden_states): class AttnUpDecoderBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -2068,23 +2161,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None @@ -2102,29 +2196,29 @@ def forward(self, hidden_states): class AttnSkipUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - 
resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=np.sqrt(2.0), - upsample_padding: int=1, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = np.sqrt(2.0), + upsample_padding: int = 1, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.attentions = nn.LayerList([]) self.resnets = nn.LayerList([]) for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels self.resnets.append( @@ -2140,14 +2234,18 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions.append( AttentionBlock( out_channels, num_head_channels=attn_num_head_channels, rescale_output_factor=output_scale_factor, - eps=resnet_eps, )) + eps=resnet_eps, + ) + ) self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) if add_upsample: @@ -2166,17 +2264,14 @@ def __init__( use_in_shortcut=True, up=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.skip_conv = nn.Conv2D( - out_channels, - 3, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) self.skip_norm = nn.GroupNorm( num_groups=min(out_channels // 4, 32), num_channels=out_channels, - epsilon=resnet_eps, ) + epsilon=resnet_eps, + ) self.act = nn.Silu() else: self.resnet_up = None @@ -2184,17 +2279,12 @@ def __init__( self.skip_norm = None self.act = None - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - skip_sample=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) @@ -2219,27 +2309,27 @@ def forward(self, class SkipUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_pre_norm: bool=True, - output_scale_factor: float=np.sqrt(2.0), - add_upsample: bool=True, - upsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + 
resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + output_scale_factor: float = np.sqrt(2.0), + add_upsample: bool = True, + upsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.resnets = nn.LayerList([]) for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels self.resnets.append( @@ -2248,15 +2338,16 @@ def __init__( out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, - groups=min((resnet_in_channels + res_skip_channels) // 4, - 32), + groups=min((resnet_in_channels + res_skip_channels) // 4, 32), groups_out=min(out_channels // 4, 32), dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) if add_upsample: @@ -2275,17 +2366,14 @@ def __init__( use_in_shortcut=True, up=True, kernel="fir", - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - self.skip_conv = nn.Conv2D( - out_channels, - 3, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1)) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + self.skip_conv = nn.Conv2D(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) self.skip_norm = nn.GroupNorm( num_groups=min(out_channels // 4, 32), num_channels=out_channels, - epsilon=resnet_eps, ) + epsilon=resnet_eps, + ) self.act = nn.Silu() else: self.resnet_up = None @@ -2293,17 +2381,12 @@ def __init__( self.skip_norm = None self.act = None - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - skip_sample=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) @@ -2326,28 +2409,28 @@ def forward(self, class ResnetUpsampleBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - skip_time_act=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + skip_time_act=False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = 
in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -2363,46 +2446,45 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - up=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.upsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + up=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.upsamplers = None self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -2410,8 +2492,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) @@ -2424,26 +2505,27 @@ def custom_forward(*inputs): class SimpleCrossAttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_upsample: bool=True, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + 
add_upsample: bool = True, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -2454,8 +2536,7 @@ def __init__( self.num_heads = out_channels // self.attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -2471,7 +2552,9 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) # TODO support AttnAddedKVProcessor2_5 # processor = ( # AttnAddedKVProcessor2_5() if hasattr(F, "scaled_dot_product_attention_") else AttnAddedKVProcessor() @@ -2489,50 +2572,54 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - skip_time_act=skip_time_act, - up=True, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) - ]) + self.upsamplers = nn.LayerList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + up=True, + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ] + ) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - upsample_size=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + upsample_size=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} for resnet, attn in zip(self.resnets, self.attentions): # resnet # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) @@ -2541,7 +2628,8 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2552,17 +2640,18 @@ def forward( class KUpBlock2D(nn.Layer): def __init__( 
- self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=5, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_group_size: Optional[int]=32, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 5, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: Optional[int] = 32, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] k_in_channels = 2 * out_channels @@ -2577,8 +2666,7 @@ def __init__( resnets.append( ResnetBlock2D( in_channels=in_channels, - out_channels=k_out_channels - if (i == num_layers - 1) else out_channels, + out_channels=k_out_channels if (i == num_layers - 1) else out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=groups, @@ -2587,7 +2675,9 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_norm=resnet_pre_temb_non_linearity, )) + pre_norm=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) @@ -2598,19 +2688,13 @@ def __init__( self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): res_hidden_states_tuple = res_hidden_states_tuple[-1] if res_hidden_states_tuple is not None: - hidden_states = paddle.concat( - [hidden_states, res_hidden_states_tuple], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1) for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -2618,8 +2702,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) @@ -2632,20 +2715,21 @@ def custom_forward(*inputs): class KCrossAttnUpBlock2D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=4, - resnet_eps: float=1e-5, - resnet_act_fn: str="gelu", - resnet_group_size: int=32, - attn_num_head_channels=1, # attention dim_head - cross_attention_dim: int=768, - add_upsample: bool=True, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: int = 32, + attn_num_head_channels=1, # attention dim_head + cross_attention_dim: int = 768, + add_upsample: bool = True, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -2686,20 +2770,24 @@ def __init__( non_linearity=resnet_act_fn, time_embedding_norm="ada_group", conv_shortcut_bias=False, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) attentions.append( KAttentionBlock( k_out_channels if (i == num_layers 
- 1) else out_channels, k_out_channels // attn_num_head_channels - if (i == num_layers - 1) else out_channels // - attn_num_head_channels, + if (i == num_layers - 1) + else out_channels // attn_num_head_channels, attn_num_head_channels, cross_attention_dim=cross_attention_dim, temb_channels=temb_channels, attention_bias=True, add_self_attention=add_self_attention, cross_attention_norm="layer_norm", - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) self.resnets = nn.LayerList(resnets) self.attentions = nn.LayerList(attentions) @@ -2712,42 +2800,39 @@ def __init__( self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + cross_attention_kwargs=None, + upsample_size=None, + attention_mask=None, + ): res_hidden_states_tuple = res_hidden_states_tuple[-1] if res_hidden_states_tuple is not None: - hidden_states = paddle.concat( - [hidden_states, res_hidden_states_tuple], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1) for resnet, attn in zip(self.resnets, self.attentions): - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: - return module( - *inputs, return_dict=return_dict)[0] # move [0] + return module(*inputs, return_dict=return_dict)[0] # move [0] else: return module(*inputs) return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, attention_mask, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( @@ -2755,7 +2840,8 @@ def custom_forward(*inputs): encoder_hidden_states=encoder_hidden_states, emb=temb, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2783,25 +2869,25 @@ class KAttentionBlock(nn.Layer): """ def __init__( - self, - dim: int, - num_attention_heads: int, - attention_head_dim: int, - dropout: float=0.0, - cross_attention_dim: Optional[int]=None, - attention_bias: bool=False, - upcast_attention: bool=False, - temb_channels: int=768, # for ada_group_norm - add_self_attention: bool=False, - cross_attention_norm: Optional[str]=None, - group_size: int=32, ): + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout: float = 0.0, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + upcast_attention: bool = False, + temb_channels: int = 768, # for ada_group_norm + add_self_attention: bool = False, + cross_attention_norm: Optional[str] = None, + group_size: int = 32, + ): super().__init__() self.add_self_attention = add_self_attention # 1. 
Self-Attn if add_self_attention: - self.norm1 = AdaGroupNorm(temb_channels, dim, - max(1, dim // group_size)) + self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) self.attn1 = Attention( query_dim=dim, heads=num_attention_heads, @@ -2809,7 +2895,8 @@ def __init__( dropout=dropout, bias=attention_bias, cross_attention_dim=None, - cross_attention_norm=None, ) + cross_attention_norm=None, + ) # 2. Cross-Attn self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) @@ -2821,25 +2908,24 @@ def __init__( dropout=dropout, bias=attention_bias, upcast_attention=upcast_attention, - cross_attention_norm=cross_attention_norm, ) + cross_attention_norm=cross_attention_norm, + ) def _to_3d(self, hidden_states, height, weight): - return hidden_states.transpose([0, 2, 3, 1]).reshape( - [hidden_states.shape[0], height * weight, -1]) + return hidden_states.transpose([0, 2, 3, 1]).reshape([hidden_states.shape[0], height * weight, -1]) def _to_4d(self, hidden_states, height, weight): - return hidden_states.transpose([0, 2, 1]).reshape( - [hidden_states.shape[0], -1, height, weight]) + return hidden_states.transpose([0, 2, 1]).reshape([hidden_states.shape[0], -1, height, weight]) def forward( - self, - hidden_states, - encoder_hidden_states=None, - emb=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + encoder_hidden_states=None, + emb=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} # 1. Self-Attention if self.add_self_attention: @@ -2851,7 +2937,8 @@ def forward( attn_output = self.attn1( norm_hidden_states, encoder_hidden_states=None, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) attn_output = self._to_4d(attn_output, height, weight) hidden_states = attn_output + hidden_states @@ -2864,7 +2951,8 @@ def forward( attn_output = self.attn2( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) attn_output = self._to_4d(attn_output, height, weight) hidden_states = attn_output + hidden_states diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py index 173a1185da9a8..606a6f0b91ba5 100644 --- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py +++ b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py @@ -23,13 +23,23 @@ from ..loaders import UNet2DConditionLoadersMixin from ..utils import NEG_INF, BaseOutput, logging from .attention_processor import AttentionProcessor, AttnProcessor -from .embeddings import (GaussianFourierProjection, TextTimeEmbedding, - TimestepEmbedding, Timesteps) +from .embeddings import ( + GaussianFourierProjection, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) from .modeling_utils import ModelMixin -from .unet_2d_blocks import (CrossAttnDownBlock2D, CrossAttnUpBlock2D, - DownBlock2D, UNetMidBlock2DCrossAttn, - UNetMidBlock2DSimpleCrossAttn, UpBlock2D, - get_down_block, get_up_block) +from .unet_2d_blocks import ( + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + get_down_block, + get_up_block, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -45,8 +55,7 @@ class UNet2DConditionOutput(BaseOutput): sample: paddle.Tensor -class 
UNet2DConditionModel(ModelMixin, ConfigMixin, - UNet2DConditionLoadersMixin): +class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep and returns sample shaped output. @@ -126,57 +135,60 @@ class conditioning with `class_embed_type` equal to `None`. @register_to_config def __init__( - self, - sample_size: Optional[int]=None, - in_channels: int=4, - out_channels: int=4, - center_input_sample: bool=False, - flip_sin_to_cos: bool=True, - freq_shift: int=0, - down_block_types: Tuple[str]=( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", ), - mid_block_type: Optional[str]="UNetMidBlock2DCrossAttn", - up_block_types: Tuple[str]=( - "UpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D", ), - only_cross_attention: Union[bool, Tuple[bool]]=False, - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: Union[int, Tuple[int]]=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-5, - cross_attention_dim: Union[int, Tuple[int]]=1280, - encoder_hid_dim: Optional[int]=None, - attention_head_dim: Union[int, Tuple[int]]=8, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - class_embed_type: Optional[str]=None, - addition_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - upcast_attention: bool=False, - resnet_time_scale_shift: str="default", - resnet_skip_time_act: bool=False, - resnet_out_scale_factor: int=1.0, - time_embedding_type: str="positional", # fourier, positional - time_embedding_dim: Optional[int]=None, - time_embedding_act_fn: Optional[str]=None, - timestep_post_act: Optional[str]=None, - time_cond_proj_dim: Optional[int]=None, - conv_in_kernel: int=3, - conv_out_kernel: int=3, - projection_class_embeddings_input_dim: Optional[int]=None, - class_embeddings_concat: bool=False, - mid_block_only_cross_attention: Optional[bool]=None, - cross_attention_norm: Optional[str]=None, - resnet_pre_temb_non_linearity: Optional[bool]=False, - addition_embed_type_num_heads: int=64, ): + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + encoder_hid_dim: Optional[int] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + 
resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", # fourier, positional + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + resnet_pre_temb_non_linearity: Optional[bool] = False, + addition_embed_type_num_heads: int = 64, + ): super().__init__() self.sample_size = sample_size @@ -192,30 +204,22 @@ def __init__( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - only_cross_attention, - bool) and len(only_cross_attention) != len(down_block_types): + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): raise ValueError( f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." ) - if isinstance( - cross_attention_dim, - list) and len(cross_attention_dim) != len(down_block_types): + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." ) - if not isinstance( - layers_per_block, - int) and len(layers_per_block) != len(down_block_types): + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): raise ValueError( f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." ) @@ -226,26 +230,25 @@ def __init__( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time if time_embedding_type == "fourier": time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 if time_embed_dim % 2 != 0: - raise ValueError( - f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." 
- ) + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( time_embed_dim // 2, set_W_to_weight=False, log=False, - flip_sin_to_cos=flip_sin_to_cos, ) + flip_sin_to_cos=flip_sin_to_cos, + ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] else: raise ValueError( @@ -257,21 +260,19 @@ def __init__( time_embed_dim, act_fn=act_fn, post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, ) + cond_proj_dim=time_cond_proj_dim, + ) if encoder_hid_dim is not None: - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, - cross_attention_dim) + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) else: self.encoder_hid_proj = None # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) # int64 + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) # int64 elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding( - timestep_input_dim, time_embed_dim, act_fn=act_fn) # float + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) # float elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -286,15 +287,13 @@ def __init__( # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. - self.class_embedding = TimestepEmbedding( - projection_class_embeddings_input_dim, time_embed_dim) # float + self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) # float elif class_embed_type == "simple_projection": if projection_class_embeddings_input_dim is None: raise ValueError( "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" ) - self.class_embedding = nn.Linear( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None @@ -307,11 +306,10 @@ def __init__( self.add_embedding = TextTimeEmbedding( text_time_embedding_from_dim, time_embed_dim, - num_heads=addition_embed_type_num_heads, ) - elif addition_embed_type is not None: - raise ValueError( - f"addition_embed_type: {addition_embed_type} must be None or 'text'." 
+ num_heads=addition_embed_type_num_heads, ) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") if time_embedding_act_fn is None: self.time_embed_act = None @@ -324,8 +322,7 @@ def __init__( elif time_embedding_act_fn == "gelu": self.time_embed_act = nn.GELU() else: - raise ValueError( - f"Unsupported activation function: {time_embedding_act_fn}") + raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") self.down_blocks = nn.LayerList([]) self.up_blocks = nn.LayerList([]) @@ -333,18 +330,16 @@ def __init__( if isinstance(only_cross_attention, bool): if mid_block_only_cross_attention is None: mid_block_only_cross_attention = only_cross_attention - only_cross_attention = [only_cross_attention] * len( - down_block_types) + only_cross_attention = [only_cross_attention] * len(down_block_types) if mid_block_only_cross_attention is None: mid_block_only_cross_attention = False if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) if isinstance(cross_attention_dim, int): - cross_attention_dim = ( - cross_attention_dim, ) * len(down_block_types) + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) if isinstance(layers_per_block, int): layers_per_block = [layers_per_block] * len(down_block_types) @@ -397,7 +392,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) # mid @@ -415,7 +411,8 @@ def __init__( dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": self.mid_block = UNetMidBlock2DSimpleCrossAttn( in_channels=block_out_channels[-1], @@ -430,7 +427,8 @@ def __init__( skip_time_act=resnet_skip_time_act, only_cross_attention=mid_block_only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type is None: self.mid_block = None else: @@ -452,8 +450,7 @@ def __init__( prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] # add upsample block for all BUT final layer if not is_final_block: @@ -483,7 +480,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -492,7 +490,8 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) if act_fn == "swish": self.conv_act = lambda x: F.silu(x) elif 
act_fn == "mish": @@ -512,7 +511,8 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, - padding=conv_out_padding, ) + padding=conv_out_padding, + ) @property def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -524,16 +524,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -542,9 +538,7 @@ def fn_recursive_add_processors( return processors - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -569,8 +563,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -618,8 +611,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -631,14 +623,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. 
# Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -650,24 +640,22 @@ def fn_recursive_set_attention_slice(module: nn.Layer, fn_recursive_set_attention_slice(module, reversed_slice_size) def _set_gradient_checkpointing(self, module, value=False): - if isinstance( - module, - (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - down_block_additional_residuals: Optional[Tuple[ - paddle.Tensor]]=None, - mid_block_additional_residual: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet2DConditionOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, + mid_block_additional_residual: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor @@ -699,8 +687,7 @@ def forward( upsample_size = None if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info( - "Forward upsample size to force interpolation output size.") + logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True # prepare attention_mask @@ -720,7 +707,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) # `Timesteps` does not contain any weights and will always return f32 tensors @@ -732,8 +723,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when num_class_embeds > 0") + raise ValueError("class_labels should be provided when num_class_embeds > 0") # maybe cast it to float16 class_labels = class_labels.cast(self.dtype) @@ -771,21 +761,16 @@ def forward( # 3. 
down - is_controlnet = (mid_block_additional_residual is not None and - down_block_additional_residuals is not None) - is_adapter = (mid_block_additional_residual is None and - down_block_additional_residuals is not None) + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: additional_kwargs = {} if is_adapter and len(down_block_additional_residuals) > 0: - additional_kwargs[ - "additional_residuals"] = down_block_additional_residuals.pop( - 0) + additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0) sample, res_samples = downsample_block( hidden_states=sample, @@ -793,25 +778,25 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, - **additional_kwargs, ) + **additional_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) if is_adapter and len(down_block_additional_residuals) > 0: sample += down_block_additional_residuals.pop(0) # westfish: add to align with torch features - res_samples = tuple(res_samples[:-1]) + (sample, ) + res_samples = tuple(res_samples[:-1]) + (sample,) down_block_res_samples += res_samples if is_controlnet: new_down_block_res_samples = () for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals): - down_block_res_sample = ( - down_block_res_sample + down_block_additional_residual) - new_down_block_res_samples += (down_block_res_sample, ) + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = new_down_block_res_samples # 4. 
mid @@ -821,7 +806,8 @@ def forward( emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if is_controlnet: sample = sample + mid_block_additional_residual @@ -830,17 +816,15 @@ def forward( for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: upsample_size = down_block_res_samples[-1].shape[2:] - if (hasattr(upsample_block, "has_cross_attention") and - upsample_block.has_cross_attention): + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: sample = upsample_block( hidden_states=sample, temb=emb, @@ -848,13 +832,15 @@ def forward( encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, upsample_size=upsample_size, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) else: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, ) + upsample_size=upsample_size, + ) # 6. post-process if self.conv_norm_out: @@ -863,6 +849,6 @@ def forward( sample = self.conv_out(sample) if not return_dict: - return (sample, ) + return (sample,) return UNet2DConditionOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py index f3feb516342c7..5e55038b49714 100644 --- a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py +++ b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py @@ -22,23 +22,24 @@ def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=True, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", ): + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=True, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", +): if down_block_type == "DownBlock3D": return DownBlock3D( num_layers=num_layers, @@ -50,11 +51,11 @@ def get_down_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, downsample_padding=downsample_padding, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) elif down_block_type == "CrossAttnDownBlock3D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnDownBlock3D") + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D") return CrossAttnDownBlock3D( num_layers=num_layers, in_channels=in_channels, @@ -71,28 +72,30 @@ def get_down_block( 
use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) raise ValueError(f"{down_block_type} does not exist.") def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=True, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", ): + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=True, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", +): if up_block_type == "UpBlock3D": return UpBlock3D( num_layers=num_layers, @@ -104,11 +107,11 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) elif up_block_type == "CrossAttnUpBlock3D": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnUpBlock3D") + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D") return CrossAttnUpBlock3D( num_layers=num_layers, in_channels=in_channels, @@ -125,33 +128,34 @@ def get_up_block( use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, - resnet_time_scale_shift=resnet_time_scale_shift, ) + resnet_time_scale_shift=resnet_time_scale_shift, + ) raise ValueError(f"{up_block_type} does not exist.") class UNetMidBlock3DCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels=1, - output_scale_factor=1.0, - cross_attention_dim=1280, - dual_cross_attention=False, - use_linear_projection=True, - upcast_attention=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + output_scale_factor=1.0, + cross_attention_dim=1280, + dual_cross_attention=False, + use_linear_projection=True, + upcast_attention=False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet resnets = [ ResnetBlock2D( @@ -164,13 +168,15 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, ) + pre_norm=resnet_pre_norm, + ) ] temp_convs = [ TemporalConvLayer( in_channels, in_channels, 
- dropout=0.1, ) + dropout=0.1, + ) ] attentions = [] temp_attentions = [] @@ -184,7 +190,9 @@ def __init__( cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) temp_attentions.append( TransformerTemporalModel( in_channels // attn_num_head_channels, @@ -192,7 +200,9 @@ def __init__( in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) resnets.append( ResnetBlock2D( in_channels=in_channels, @@ -204,38 +214,45 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( in_channels, in_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) self.attentions = nn.LayerList(attentions) self.temp_attentions = nn.LayerList(temp_attentions) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + num_frames=1, + cross_attention_kwargs=None, + ): hidden_states = self.resnets[0](hidden_states, temb) hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames) for attn, temp_attn, resnet, temp_conv in zip( - self.attentions, self.temp_attentions, self.resnets[1:], - self.temp_convs[1:]): + self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:] + ): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = temp_attn( hidden_states, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) return hidden_states @@ -243,26 +260,27 @@ def forward( class CrossAttnDownBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels=1, - cross_attention_dim=1280, - output_scale_factor=1.0, - downsample_padding=1, - add_downsample=True, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + downsample_padding=1, + add_downsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): super().__init__() resnets = [] attentions = [] @@ -283,12 +301,16 @@ def __init__( 
time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) attentions.append( Transformer2DModel( out_channels // attn_num_head_channels, @@ -299,7 +321,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) temp_attentions.append( TransformerTemporalModel( out_channels // attn_num_head_channels, @@ -307,70 +331,79 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) self.attentions = nn.LayerList(attentions) self.temp_attentions = nn.LayerList(temp_attentions) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + num_frames=1, + cross_attention_kwargs=None, + ): output_states = () for resnet, temp_conv, attn, temp_attn in zip( - self.resnets, self.temp_convs, self.attentions, - self.temp_attentions): + self.resnets, self.temp_convs, self.attentions, self.temp_attentions + ): hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = temp_attn( hidden_states, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ).sample - output_states += (hidden_states, ) + cross_attention_kwargs=cross_attention_kwargs, + ).sample + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class DownBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor=1.0, - add_downsample=True, - downsample_padding=1, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_downsample=True, + downsample_padding=1, + ): 
super().__init__() resnets = [] temp_convs = [] @@ -387,23 +420,30 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) if add_downsample: - self.downsamplers = nn.LayerList([ - Downsample2D( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False @@ -413,36 +453,37 @@ def forward(self, hidden_states, temb=None, num_frames=1): for resnet, temp_conv in zip(self.resnets, self.temp_convs): hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states class CrossAttnUpBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels=1, - cross_attention_dim=1280, - output_scale_factor=1.0, - add_upsample=True, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + add_upsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): super().__init__() resnets = [] temp_convs = [] @@ -451,8 +492,7 @@ def __init__( self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( ResnetBlock2D( @@ -465,12 +505,16 @@ def __init__( time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) attentions.append( Transformer2DModel( out_channels // attn_num_head_channels, @@ -481,7 +525,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - 
upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) temp_attentions.append( TransformerTemporalModel( out_channels // attn_num_head_channels, @@ -489,48 +535,51 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.resnets = nn.LayerList(sublayers=resnets) self.temp_convs = nn.LayerList(sublayers=temp_convs) self.attentions = nn.LayerList(sublayers=attentions) self.temp_attentions = nn.LayerList(sublayers=temp_attentions) if add_upsample: - self.upsamplers = nn.LayerList(sublayers=[ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList( + sublayers=[Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] + ) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - upsample_size=None, - attention_mask=None, - num_frames=1, - cross_attention_kwargs=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + upsample_size=None, + attention_mask=None, + num_frames=1, + cross_attention_kwargs=None, + ): for resnet, temp_conv, attn, temp_attn in zip( - self.resnets, self.temp_convs, self.attentions, - self.temp_attentions): + self.resnets, self.temp_convs, self.attentions, self.temp_attentions + ): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = temp_attn( hidden_states, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample if self.upsamplers is not None: for upsampler in self.upsamplers: hidden_states = upsampler(hidden_states, upsample_size) @@ -539,26 +588,26 @@ def forward( class UpBlock3D(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-06, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor=1.0, - add_upsample=True, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-06, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_upsample=True, + ): super().__init__() resnets = [] temp_convs = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( ResnetBlock2D( @@ -571,36 +620,37 @@ def __init__( 
time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, )) + pre_norm=resnet_pre_norm, + ) + ) temp_convs.append( TemporalConvLayer( out_channels, out_channels, - dropout=0.1, )) + dropout=0.1, + ) + ) self.resnets = nn.LayerList(resnets) self.temp_convs = nn.LayerList(temp_convs) if add_upsample: - self.upsamplers = nn.LayerList([ - Upsample2D( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None, - num_frames=1, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None, + num_frames=1, + ): for resnet, temp_conv in zip(self.resnets, self.temp_convs): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - x=[hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1) hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) if self.upsamplers is not None: diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py index fb8ae5756d4c3..038e8c6d514a7 100644 --- a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py +++ b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py @@ -26,9 +26,15 @@ from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .transformer_temporal import TransformerTemporalModel -from .unet_3d_blocks import (CrossAttnDownBlock3D, CrossAttnUpBlock3D, - DownBlock3D, UNetMidBlock3DCrossAttn, UpBlock3D, - get_down_block, get_up_block) +from .unet_3d_blocks import ( + CrossAttnDownBlock3D, + CrossAttnUpBlock3D, + DownBlock3D, + UNetMidBlock3DCrossAttn, + UpBlock3D, + get_down_block, + get_up_block, +) logger = logging.get_logger(__name__) @@ -44,8 +50,7 @@ class UNet3DConditionOutput(BaseOutput): sample: paddle.Tensor -class UNet3DConditionModel(ModelMixin, ConfigMixin, - UNet2DConditionLoadersMixin): +class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" UNet3DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep and returns sample shaped output. 
@@ -79,29 +84,32 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, @register_to_config def __init__( - self, - sample_size: Optional[int]=None, - in_channels: int=4, - out_channels: int=4, - down_block_types: Tuple[str]=( - "CrossAttnDownBlock3D", - "CrossAttnDownBlock3D", - "CrossAttnDownBlock3D", - "DownBlock3D", ), - up_block_types: Tuple[str]=( - "UpBlock3D", - "CrossAttnUpBlock3D", - "CrossAttnUpBlock3D", - "CrossAttnUpBlock3D", ), - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: int=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-05, - cross_attention_dim: int=1024, - attention_head_dim: Union[int, Tuple[int]]=64, ): + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ), + up_block_types: Tuple[str] = ( + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ), + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-05, + cross_attention_dim: int = 1024, + attention_head_dim: Union[int, Tuple[int]] = 64, + ): super().__init__() self.sample_size = sample_size # Check inputs @@ -113,9 +121,7 @@ def __init__( raise ValueError( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." 
) @@ -126,7 +132,8 @@ def __init__( in_channels=in_channels, out_channels=block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time time_embed_dim = block_out_channels[0] * 4 self.time_proj = Timesteps(block_out_channels[0], True, 0) @@ -134,17 +141,19 @@ def __init__( self.time_embedding = TimestepEmbedding( timestep_input_dim, time_embed_dim, - act_fn=act_fn, ) + act_fn=act_fn, + ) self.transformer_in = TransformerTemporalModel( num_attention_heads=8, attention_head_dim=attention_head_dim, in_channels=block_out_channels[0], - num_layers=1, ) + num_layers=1, + ) # class embedding self.down_blocks = nn.LayerList(sublayers=[]) self.up_blocks = nn.LayerList(sublayers=[]) if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) # down output_channel = block_out_channels[0] @@ -165,7 +174,8 @@ def __init__( cross_attention_dim=cross_attention_dim, attn_num_head_channels=attention_head_dim[i], downsample_padding=downsample_padding, - dual_cross_attention=False, ) + dual_cross_attention=False, + ) self.down_blocks.append(down_block) # mid self.mid_block = UNetMidBlock3DCrossAttn( @@ -177,7 +187,8 @@ def __init__( cross_attention_dim=cross_attention_dim, attn_num_head_channels=attention_head_dim[-1], resnet_groups=norm_num_groups, - dual_cross_attention=False, ) + dual_cross_attention=False, + ) # count how many layers upsample the images self.num_upsamplers = 0 # up @@ -188,8 +199,7 @@ def __init__( is_final_block = i == len(block_out_channels) - 1 prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] # add upsample block for all BUT final layer if not is_final_block: add_upsample = True @@ -209,14 +219,16 @@ def __init__( resnet_groups=norm_num_groups, cross_attention_dim=cross_attention_dim, attn_num_head_channels=reversed_attention_head_dim[i], - dual_cross_attention=False, ) + dual_cross_attention=False, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel if norm_num_groups is not None: self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) self.conv_act = nn.Silu() else: self.conv_norm_out = None @@ -226,7 +238,8 @@ def __init__( in_channels=block_out_channels[0], out_channels=out_channels, kernel_size=conv_out_kernel, - padding=conv_out_padding, ) + padding=conv_out_padding, + ) @property # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors @@ -239,16 +252,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -295,8 +304,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice 
possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -308,14 +316,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. # Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -327,9 +333,7 @@ def fn_recursive_set_attention_slice(module: nn.Layer, fn_recursive_set_attention_slice(module, reversed_slice_size) # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -354,8 +358,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -368,24 +371,22 @@ def set_default_attn_processor(self): self.set_attn_processor(AttnProcessor()) def _set_gradient_checkpointing(self, module, value=False): - if isinstance( - module, - (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): + if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - down_block_additional_residuals: Optional[Tuple[ - paddle.Tensor]]=None, - mid_block_additional_residual: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet3DConditionOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, + mid_block_additional_residual: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple]: """ Args: sample (`paddle.Tensor`): (batch, num_frames, channel, height, width) noisy inputs tensor @@ -417,8 +418,7 @@ 
def forward( upsample_size = None if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info( - "Forward upsample size to force interpolation output size.") + logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True # prepare attention_mask if attention_mask is not None: @@ -436,7 +436,11 @@ def forward( elif len(timesteps.shape) == 0: timesteps = timesteps[None] num_frames = sample.shape[2] - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors @@ -445,38 +449,36 @@ def forward( t_emb = t_emb.cast(dtype=self.dtype) emb = self.time_embedding(t_emb, timestep_cond) emb = emb.repeat_interleave(repeats=num_frames, axis=0) - encoder_hidden_states = encoder_hidden_states.repeat_interleave( - repeats=num_frames, axis=0) - sample = sample.transpose([0, 2, 1, 3, 4]).reshape((sample.shape[ - 0] * num_frames, -1) + tuple(sample.shape[3:])) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0) + sample = sample.transpose([0, 2, 1, 3, 4]).reshape( + (sample.shape[0] * num_frames, -1) + tuple(sample.shape[3:]) + ) sample = self.conv_in(sample) sample = self.transformer_in( - sample, - num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs).sample + sample, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs + ).sample # 3. down - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: sample, res_samples = downsample_block( hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb, num_frames=num_frames) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) down_block_res_samples += res_samples if down_block_additional_residuals is not None: new_down_block_res_samples = () for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals): - down_block_res_sample = ( - down_block_res_sample + down_block_additional_residual) - new_down_block_res_samples += (down_block_res_sample, ) + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = new_down_block_res_samples # 4. mid if self.mid_block is not None: @@ -486,21 +488,20 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if mid_block_additional_residual is not None: sample = sample + mid_block_additional_residual # 5. 
up for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: upsample_size = down_block_res_samples[-1].shape[2:] - if (hasattr(upsample_block, "has_cross_attention") and - upsample_block.has_cross_attention): + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: sample = upsample_block( hidden_states=sample, temb=emb, @@ -509,23 +510,23 @@ def forward( upsample_size=upsample_size, attention_mask=attention_mask, num_frames=num_frames, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) else: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size, - num_frames=num_frames, ) + num_frames=num_frames, + ) # 6. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) sample = self.conv_act(sample) sample = self.conv_out(sample) # reshape to (batch, channel, framerate, width, height) - sample = (sample[None, :] - .reshape((-1, num_frames) + tuple(sample.shape[1:])) - .transpose([0, 2, 1, 3, 4])) + sample = sample[None, :].reshape((-1, num_frames) + tuple(sample.shape[1:])).transpose([0, 2, 1, 3, 4]) if not return_dict: - return (sample, ) + return (sample,) return UNet3DConditionOutput(sample=sample) diff --git a/ppdiffusers/ppdiffusers/models/uvit.py b/ppdiffusers/ppdiffusers/models/uvit.py index f2140122e269f..eb7267d41d2a2 100644 --- a/ppdiffusers/ppdiffusers/models/uvit.py +++ b/ppdiffusers/ppdiffusers/models/uvit.py @@ -27,21 +27,15 @@ def unpatchify(x, in_chans): - patch_size = int((x.shape[2] // in_chans)**0.5) - h = w = int(x.shape[1]**0.5) - assert h * w == x.shape[1] and patch_size**2 * in_chans == x.shape[2] - x = einops.rearrange( - x, - "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", - h=h, - p1=patch_size, - p2=patch_size) + patch_size = int((x.shape[2] // in_chans) ** 0.5) + h = w = int(x.shape[1] ** 0.5) + assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2] + x = einops.rearrange(x, "B (h w) (p1 p2 C) -> B C (h p1) (w p2)", h=h, p1=patch_size, p2=patch_size) return x def interpolate_pos_emb(pos_emb, old_shape, new_shape): - pos_emb = einops.rearrange( - pos_emb, "B (H W) C -> B C H W", H=old_shape[0], W=old_shape[1]) + pos_emb = einops.rearrange(pos_emb, "B (H W) C -> B C H W", H=old_shape[0], W=old_shape[1]) pos_emb = F.interpolate(pos_emb, new_shape, mode="bilinear") pos_emb = einops.rearrange(pos_emb, "B C H W -> B (H W) C") return pos_emb @@ -49,13 +43,14 @@ def interpolate_pos_emb(pos_emb, old_shape, new_shape): class Attention(nn.Layer): def __init__( - self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0.0, - proj_drop=0.0, ): + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads @@ -82,9 +77,10 @@ def reshape_batch_dim_to_heads(self, tensor, transpose=True): return tensor def set_use_memory_efficient_attention_xformers( - self, - 
use_memory_efficient_attention_xformers: bool, - attention_op: Optional[str]=None, ): + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[str] = None, + ): # remove this PR: https://github.com/PaddlePaddle/Paddle/pull/56045 # if self.head_size > 128 and attention_op == "flash": # attention_op = "cutlass" @@ -96,18 +92,15 @@ def set_use_memory_efficient_attention_xformers( else: try: _ = F.scaled_dot_product_attention_( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - attention_op=attention_op, ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + attention_op=attention_op, + ) except Exception as e: raise e - self._use_memory_efficient_attention_xformers = ( - use_memory_efficient_attention_xformers) + self._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers self._attention_op = attention_op def forward(self, x): @@ -116,14 +109,14 @@ def forward(self, x): qkv = qkv.cast(paddle.float32) query_proj, key_proj, value_proj = qkv.chunk(3, axis=-1) query_proj = self.reshape_heads_to_batch_dim( - query_proj, - transpose=not self._use_memory_efficient_attention_xformers) + query_proj, transpose=not self._use_memory_efficient_attention_xformers + ) key_proj = self.reshape_heads_to_batch_dim( - key_proj, - transpose=not self._use_memory_efficient_attention_xformers) + key_proj, transpose=not self._use_memory_efficient_attention_xformers + ) value_proj = self.reshape_heads_to_batch_dim( - value_proj, - transpose=not self._use_memory_efficient_attention_xformers) + value_proj, transpose=not self._use_memory_efficient_attention_xformers + ) if self._use_memory_efficient_attention_xformers: hidden_states = F.scaled_dot_product_attention_( @@ -134,18 +127,17 @@ def forward(self, x): scale=self.scale, dropout_p=self.attn_drop, training=self.training, - attention_op=self._attention_op, ) + attention_op=self._attention_op, + ) else: with paddle.amp.auto_cast(enable=False): - attention_scores = paddle.matmul( - query_proj * self.scale, key_proj, transpose_y=True) + attention_scores = paddle.matmul(query_proj * self.scale, key_proj, transpose_y=True) attention_probs = F.softmax(attention_scores, axis=-1) - hidden_states = paddle.matmul(attention_probs, - value_proj).cast(x.dtype) + hidden_states = paddle.matmul(attention_probs, value_proj).cast(x.dtype) hidden_states = self.reshape_batch_dim_to_heads( - hidden_states, - transpose=not self._use_memory_efficient_attention_xformers) + hidden_states, transpose=not self._use_memory_efficient_attention_xformers + ) hidden_states = self.proj_drop(self.proj(hidden_states)) return hidden_states @@ -153,18 +145,19 @@ def forward(self, x): class Block(nn.Layer): def __init__( - self, - dim, - num_heads, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - drop=0.0, - attn_drop=0.0, - drop_path=0.0, - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - skip=False, ): + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + skip=False, + ): super().__init__() self.norm1 = norm_layer(dim) if skip else None self.norm2 = norm_layer(dim) @@ -175,16 +168,17 @@ def __init__( qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, - proj_drop=drop, ) - self.drop_path = DropPath( - 
drop_path) if drop_path > 0.0 else nn.Identity() + proj_drop=drop, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm3 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) self.mlp = Mlp( in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, - drop=drop, ) + drop=drop, + ) self.skip_linear = nn.Linear(2 * dim, dim) if skip else None def forward(self, x, skip=None): @@ -223,44 +217,43 @@ class UViTModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - sample_size=1, - img_size=64, - in_channels=4, - patch_size=2, - embed_dim=1536, - depth=30, - num_heads=24, - mlp_ratio=4.0, - qkv_bias=False, - qk_scale=None, - pos_drop_rate=0.0, - drop_rate=0.0, - attn_drop_rate=0.0, - norm_type="layer_norm", - text_dim=64, - num_text_tokens=77, - clip_img_dim=512, - use_checkpoint=False, ): + self, + sample_size=1, + img_size=64, + in_channels=4, + patch_size=2, + embed_dim=1536, + depth=30, + num_heads=24, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + pos_drop_rate=0.0, + drop_rate=0.0, + attn_drop_rate=0.0, + norm_type="layer_norm", + text_dim=64, + num_text_tokens=77, + clip_img_dim=512, + use_checkpoint=False, + ): super().__init__() self.sample_size = sample_size self.in_channels = in_channels self.patch_size = patch_size self.embed_dim = embed_dim - self.img_size = (img_size, img_size) if isinstance(img_size, - int) else img_size + self.img_size = (img_size, img_size) if isinstance(img_size, int) else img_size self.patch_embed = PatchEmbed( height=self.img_size[0], width=self.img_size[1], patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim, - add_pos_embed=False, ) - assert self.img_size[0] % patch_size == 0 and self.img_size[ - 1] % patch_size == 0 - self.num_patches = (self.img_size[0] // patch_size) * ( - self.img_size[1] // patch_size) + add_pos_embed=False, + ) + assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0 + self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size) self.encode_prefix = nn.Linear(768, text_dim) @@ -274,22 +267,27 @@ def __init__( self.pos_embed = self.create_parameter( shape=(1, self.num_tokens, embed_dim), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) assert norm_type == "layer_norm", "We only support norm_type == layer_norm. 
" norm_layer = nn.LayerNorm self.pos_drop = nn.Dropout(p=pos_drop_rate) - self.in_blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - norm_layer=norm_layer, ) for _ in range(depth // 2) - ]) + self.in_blocks = nn.LayerList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + norm_layer=norm_layer, + ) + for _ in range(depth // 2) + ] + ) self.mid_block = Block( dim=embed_dim, @@ -299,20 +297,25 @@ def __init__( qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, - norm_layer=norm_layer, ) - - self.out_blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - norm_layer=norm_layer, - skip=True, ) for _ in range(depth // 2) - ]) + norm_layer=norm_layer, + ) + + self.out_blocks = nn.LayerList( + [ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + norm_layer=norm_layer, + skip=True, + ) + for _ in range(depth // 2) + ] + ) self.norm = norm_layer(embed_dim) self.patch_dim = patch_size**2 * in_channels @@ -320,18 +323,18 @@ def __init__( self.token_embedding = nn.Embedding(2, embed_dim) self.pos_embed_token = self.create_parameter( - shape=(1, 1, embed_dim), - default_initializer=nn.initializer.Constant(0.0)) + shape=(1, 1, embed_dim), default_initializer=nn.initializer.Constant(0.0) + ) def forward( - self, - img: paddle.Tensor, - clip_img: paddle.Tensor, - text: paddle.Tensor, - t_img: paddle.Tensor, - t_text: paddle.Tensor, - data_type: paddle.Tensor, - return_dict=False, # TODO: nf + self, + img: paddle.Tensor, + clip_img: paddle.Tensor, + text: paddle.Tensor, + t_img: paddle.Tensor, + t_text: paddle.Tensor, + data_type: paddle.Tensor, + return_dict=False, # TODO: nf ): _, _, H, W = img.shape # TODO junnyu, support float16 @@ -343,10 +346,8 @@ def forward( clip_img = self.clip_img_embed(clip_img) text = self.text_embed(text) - t_img_token = get_timestep_embedding(t_img, self.embed_dim, True, - 0).unsqueeze(axis=1) - t_text_token = get_timestep_embedding(t_text, self.embed_dim, True, - 0).unsqueeze(axis=1) + t_img_token = get_timestep_embedding(t_img, self.embed_dim, True, 0).unsqueeze(axis=1) + t_text_token = get_timestep_embedding(t_text, self.embed_dim, True, 0).unsqueeze(axis=1) token_embed = self.token_embedding(data_type).unsqueeze(axis=1) # TODO junnyu, support float16 @@ -354,35 +355,35 @@ def forward( t_text_token = t_text_token.cast(self.dtype) token_embed = token_embed.cast(self.dtype) - x = paddle.concat( - (t_img_token, t_text_token, token_embed, text, clip_img, img), - axis=1) + x = paddle.concat((t_img_token, t_text_token, token_embed, text, clip_img, img), axis=1) num_text_tokens, num_img_tokens = text.shape[1], img.shape[1] pos_embed = paddle.concat( [ - self.pos_embed[:, :1 + 1, :], + self.pos_embed[:, : 1 + 1, :], self.pos_embed_token, - self.pos_embed[:, 1 + 1:, :], + self.pos_embed[:, 1 + 1 :, :], ], - axis=1, ) + axis=1, + ) if H == self.img_size[0] and W == self.img_size[1]: pass else: # interpolate the positional embedding when the input image is not of the default shape pos_embed_others, pos_embed_patches = paddle.split( - pos_embed, [1 + 1 + 1 + num_text_tokens + 1, 
self.num_patches], - axis=1) + pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches], axis=1 + ) pos_embed_patches = interpolate_pos_emb( pos_embed_patches, ( self.img_size[0] // self.patch_size, - self.img_size[1] // self.patch_size, ), - (H // self.patch_size, W // self.patch_size), ) - pos_embed = paddle.concat( - (pos_embed_others, pos_embed_patches), axis=1) + self.img_size[1] // self.patch_size, + ), + (H // self.patch_size, W // self.patch_size), + ) + pos_embed = paddle.concat((pos_embed_others, pos_embed_patches), axis=1) x = x + pos_embed x = self.pos_drop(x) @@ -405,8 +406,8 @@ def forward( token_embed_out, text_out, clip_img_out, - img_out, ) = x.split( - (1, 1, 1, num_text_tokens, 1, num_img_tokens), axis=1) + img_out, + ) = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), axis=1) img_out = self.decoder_pred(img_out) sample_img = unpatchify(img_out, self.in_channels) @@ -419,4 +420,5 @@ def forward( return UViTModelOutput( sample_img=sample_img, sample_clip_img=sample_clip_img, - sample_text=sample_text, ) + sample_text=sample_text, + ) diff --git a/ppdiffusers/ppdiffusers/models/vae.py b/ppdiffusers/ppdiffusers/models/vae.py index f3b9a81b43a67..4b1fce10910a6 100644 --- a/ppdiffusers/ppdiffusers/models/vae.py +++ b/ppdiffusers/ppdiffusers/models/vae.py @@ -53,24 +53,20 @@ class DecoderOutput(BaseOutput): class Encoder(nn.Layer): def __init__( - self, - in_channels=3, - out_channels=3, - down_block_types=("DownEncoderBlock2D", ), - block_out_channels=(64, ), - layers_per_block=2, - norm_num_groups=32, - act_fn="silu", - double_z=True, ): + self, + in_channels=3, + out_channels=3, + down_block_types=("DownEncoderBlock2D",), + block_out_channels=(64,), + layers_per_block=2, + norm_num_groups=32, + act_fn="silu", + double_z=True, + ): super().__init__() self.layers_per_block = layers_per_block - self.conv_in = nn.Conv2D( - in_channels, - block_out_channels[0], - kernel_size=3, - stride=1, - padding=1) + self.conv_in = nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) self.mid_block = None self.down_blocks = nn.LayerList([]) @@ -93,7 +89,8 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=None, ) + temb_channels=None, + ) self.down_blocks.append(down_block) # mid @@ -105,18 +102,19 @@ def __init__( resnet_time_scale_shift="default", attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=None, ) + temb_channels=None, + ) # out self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[-1], num_groups=norm_num_groups, - epsilon=1e-6, ) + epsilon=1e-6, + ) self.conv_act = nn.Silu() conv_out_channels = 2 * out_channels if double_z else out_channels - self.conv_out = nn.Conv2D( - block_out_channels[-1], conv_out_channels, 3, padding=1) + self.conv_out = nn.Conv2D(block_out_channels[-1], conv_out_channels, 3, padding=1) self.gradient_checkpointing = False def forward(self, x): @@ -156,23 +154,19 @@ def custom_forward(*inputs): class Decoder(nn.Layer): def __init__( - self, - in_channels=3, - out_channels=3, - up_block_types=("UpDecoderBlock2D", ), - block_out_channels=(64, ), - layers_per_block=2, - norm_num_groups=32, - act_fn="silu", ): + self, + in_channels=3, + out_channels=3, + up_block_types=("UpDecoderBlock2D",), + block_out_channels=(64,), + layers_per_block=2, + norm_num_groups=32, + act_fn="silu", + ): super().__init__() self.layers_per_block = layers_per_block - self.conv_in = nn.Conv2D( - in_channels, - block_out_channels[-1], - 
kernel_size=3, - stride=1, - padding=1) + self.conv_in = nn.Conv2D(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1) self.mid_block = None self.up_blocks = nn.LayerList([]) @@ -186,7 +180,8 @@ def __init__( resnet_time_scale_shift="default", attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=None, ) + temb_channels=None, + ) # up reversed_block_out_channels = list(reversed(block_out_channels)) @@ -208,18 +203,15 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=None, ) + temb_channels=None, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - self.conv_norm_out = nn.GroupNorm( - num_channels=block_out_channels[0], - num_groups=norm_num_groups, - epsilon=1e-6) + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=1e-6) self.conv_act = nn.Silu() - self.conv_out = nn.Conv2D( - block_out_channels[0], out_channels, 3, padding=1) + self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, 3, padding=1) self.gradient_checkpointing = False def forward(self, z): @@ -255,8 +247,7 @@ def custom_forward(*inputs): # (TODO, junnyu) check nan # clamp inf values to enable fp16 training - if (amp_state() or - sample.dtype == paddle.float16) and paddle.isinf(sample).any(): + if (amp_state() or sample.dtype == paddle.float16) and paddle.isinf(sample).any(): clamp_value = finfo(sample.dtype).max - 1000 sample = paddle.clip(sample, min=-clamp_value, max=clamp_value) @@ -278,14 +269,15 @@ class VectorQuantizer(nn.Layer): # backwards compatibility we use the buggy version by default, but you can # specify legacy=False to fix it. def __init__( - self, - n_e, - vq_embed_dim, - beta, - remap=None, - unknown_index="random", - sane_index_shape=False, - legacy=True, ): + self, + n_e, + vq_embed_dim, + beta, + remap=None, + unknown_index="random", + sane_index_shape=False, + legacy=True, + ): super().__init__() self.n_e = n_e self.vq_embed_dim = vq_embed_dim @@ -306,8 +298,10 @@ def __init__( if self.unknown_index == "extra": self.unknown_index = self.re_embed self.re_embed = self.re_embed + 1 - print(f"Remapping {self.n_e} indices to {self.re_embed} indices. " - f"Using {self.unknown_index} for unknown indices.") + print( + f"Remapping {self.n_e} indices to {self.re_embed} indices. " + f"Using {self.unknown_index} for unknown indices." 
+ ) else: self.re_embed = n_e @@ -322,8 +316,7 @@ def remap_to_used(self, inds): new = match.argmax(-1) unknown = match.sum(2) < 1 if self.unknown_index == "random": - new[unknown] = paddle.randint( - 0, self.re_embed, shape=new[unknown].shape) + new[unknown] = paddle.randint(0, self.re_embed, shape=new[unknown].shape) else: new[unknown] = self.unknown_index return new.reshape(ishape) @@ -335,8 +328,7 @@ def unmap_to_all(self, inds): used = self.used.cast(inds.dtype) if self.re_embed > self.used.shape[0]: # extra token inds[inds >= self.used.shape[0]] = 0 # simply set to zero - back = paddle.take_along_axis( - used[None, :][inds.shape[0] * [0], :], inds, axis=1) + back = paddle.take_along_axis(used[None, :][inds.shape[0] * [0], :], inds, axis=1) return back.reshape(ishape) def forward(self, z): @@ -345,9 +337,11 @@ def forward(self, z): z_flattened = z.reshape([-1, self.vq_embed_dim]) # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z - d = (paddle.sum(z_flattened**2, axis=1, keepdim=True) + paddle.sum( - self.embedding.weight**2, axis=1) - 2 * paddle.matmul( - z_flattened, self.embedding.weight, transpose_y=True)) + d = ( + paddle.sum(z_flattened**2, axis=1, keepdim=True) + + paddle.sum(self.embedding.weight**2, axis=1) + - 2 * paddle.matmul(z_flattened, self.embedding.weight, transpose_y=True) + ) min_encoding_indices = paddle.argmin(d, axis=1) z_q = self.embedding(min_encoding_indices).reshape(z.shape) @@ -356,11 +350,9 @@ def forward(self, z): # compute loss for embedding if not self.legacy: - loss = self.beta * paddle.mean((z_q.detach() - z)**2) + paddle.mean( - (z_q - z.detach())**2) + loss = self.beta * paddle.mean((z_q.detach() - z) ** 2) + paddle.mean((z_q - z.detach()) ** 2) else: - loss = paddle.mean((z_q.detach() - z)**2) + self.beta * paddle.mean( - (z_q - z.detach())**2) + loss = paddle.mean((z_q.detach() - z) ** 2) + self.beta * paddle.mean((z_q - z.detach()) ** 2) # preserve gradients z_q = z + (z_q - z).detach() @@ -369,15 +361,12 @@ def forward(self, z): z_q = z_q.transpose([0, 3, 1, 2]) if self.remap is not None: - min_encoding_indices = min_encoding_indices.reshape( - [z.shape[0], -1]) # add batch axis + min_encoding_indices = min_encoding_indices.reshape([z.shape[0], -1]) # add batch axis min_encoding_indices = self.remap_to_used(min_encoding_indices) - min_encoding_indices = min_encoding_indices.reshape( - [-1, 1]) # flatten + min_encoding_indices = min_encoding_indices.reshape([-1, 1]) # flatten if self.sane_index_shape: - min_encoding_indices = min_encoding_indices.reshape( - [z_q.shape[0], z_q.shape[2], z_q.shape[3]]) + min_encoding_indices = min_encoding_indices.reshape([z_q.shape[0], z_q.shape[2], z_q.shape[3]]) return z_q, loss, (perplexity, min_encodings, min_encoding_indices) @@ -386,7 +375,11 @@ def get_codebook_entry(self, indices, shape): if self.remap is not None: indices = indices.reshape([shape[0], -1]) # add batch axis indices = self.unmap_to_all(indices) - indices = indices.reshape([-1, ]) # flatten again + indices = indices.reshape( + [ + -1, + ] + ) # flatten again # get quantized latent vectors z_q = self.embedding(indices) @@ -408,14 +401,11 @@ def __init__(self, parameters, deterministic=False): self.std = paddle.exp(0.5 * self.logvar) self.var = paddle.exp(self.logvar) if self.deterministic: - self.var = self.std = paddle.zeros_like( - self.mean, dtype=self.parameters.dtype) + self.var = self.std = paddle.zeros_like(self.mean, dtype=self.parameters.dtype) - def sample(self, - generator: Optional[paddle.Generator]=None) -> 
paddle.Tensor: + def sample(self, generator: Optional[paddle.Generator] = None) -> paddle.Tensor: # make sure sample is on the same device as the parameters and has same dtype - sample = randn_tensor( - self.mean.shape, generator=generator, dtype=self.parameters.dtype) + sample = randn_tensor(self.mean.shape, generator=generator, dtype=self.parameters.dtype) x = self.mean + self.std * sample return x @@ -426,21 +416,26 @@ def kl(self, other=None): if other is None: return 0.5 * paddle.sum( paddle.pow(self.mean, 2) + self.var - 1.0 - self.logvar, - axis=[1, 2, 3], ) + axis=[1, 2, 3], + ) else: return 0.5 * paddle.sum( - paddle.pow(self.mean - other.mean, 2) / other.var + self.var - / other.var - 1.0 - self.logvar + other.logvar, - axis=[1, 2, 3], ) + paddle.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + axis=[1, 2, 3], + ) def nll(self, sample, axis=[1, 2, 3]): if self.deterministic: return paddle.to_tensor([0.0]) logtwopi = np.log(2.0 * np.pi) return 0.5 * paddle.sum( - logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) / - self.var, - axis=axis, ) + logtwopi + self.logvar + paddle.pow(sample - self.mean, 2) / self.var, + axis=axis, + ) def mode(self): return self.mean diff --git a/ppdiffusers/ppdiffusers/models/vq_model.py b/ppdiffusers/ppdiffusers/models/vq_model.py index 87a07653649cd..8104816e90486 100644 --- a/ppdiffusers/ppdiffusers/models/vq_model.py +++ b/ppdiffusers/ppdiffusers/models/vq_model.py @@ -69,20 +69,21 @@ class VQModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - in_channels: int=3, - out_channels: int=3, - down_block_types: Tuple[str]=("DownEncoderBlock2D", ), - up_block_types: Tuple[str]=("UpDecoderBlock2D", ), - block_out_channels: Tuple[int]=(64, ), - layers_per_block: int=1, - act_fn: str="silu", - latent_channels: int=3, - sample_size: int=32, - num_vq_embeddings: int=256, - norm_num_groups: int=32, - vq_embed_dim: Optional[int]=None, - scaling_factor: float=0.18215, ): + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 3, + sample_size: int = 32, + num_vq_embeddings: int = 256, + norm_num_groups: int = 32, + vq_embed_dim: Optional[int] = None, + scaling_factor: float = 0.18215, + ): super().__init__() # pass init params to Encoder @@ -94,7 +95,8 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, - double_z=False, ) + double_z=False, + ) vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels @@ -104,7 +106,8 @@ def __init__( vq_embed_dim, beta=0.25, remap=None, - sane_index_shape=False, ) + sane_index_shape=False, + ) self.post_quant_conv = nn.Conv2D(vq_embed_dim, latent_channels, 1) # pass init params to Decoder @@ -115,22 +118,24 @@ def __init__( block_out_channels=block_out_channels, layers_per_block=layers_per_block, act_fn=act_fn, - norm_num_groups=norm_num_groups, ) + norm_num_groups=norm_num_groups, + ) - def encode(self, x: paddle.Tensor, return_dict: bool=True): + def encode(self, x: paddle.Tensor, return_dict: bool = True): h = self.encoder(x) h = self.quant_conv(h) if not return_dict: - return (h, ) + return (h,) return VQEncoderOutput(latents=h) def decode( - self, - h: paddle.Tensor, - force_not_quantize: bool=False, - return_dict: bool=True, ): + 
self, + h: paddle.Tensor, + force_not_quantize: bool = False, + return_dict: bool = True, + ): # cast h to float16 / float32 h = h.cast(self.dtype) # also go through quantization layer @@ -142,11 +147,11 @@ def decode( dec = self.decoder(quant) if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) - def forward(self, sample: paddle.Tensor, return_dict: bool=True): + def forward(self, sample: paddle.Tensor, return_dict: bool = True): r""" Args: sample (`paddle.Tensor`): Input sample. @@ -158,6 +163,6 @@ def forward(self, sample: paddle.Tensor, return_dict: bool=True): dec = self.decode(h).sample if not return_dict: - return (dec, ) + return (dec,) return DecoderOutput(sample=dec) diff --git a/ppdiffusers/ppdiffusers/optimization.py b/ppdiffusers/ppdiffusers/optimization.py index 738ef9f4d113f..d6c5efafaed3f 100644 --- a/ppdiffusers/ppdiffusers/optimization.py +++ b/ppdiffusers/ppdiffusers/optimization.py @@ -34,7 +34,7 @@ class SchedulerType(Enum): CONSTANT_WITH_WARMUP = "constant_with_warmup" -def get_constant_schedule(learning_rate: float, last_epoch: int=-1): +def get_constant_schedule(learning_rate: float, last_epoch: int = -1): """ Create a schedule with a constant learning rate, using the learning rate set in optimizer. @@ -50,9 +50,7 @@ def get_constant_schedule(learning_rate: float, last_epoch: int=-1): return LambdaDecay(learning_rate, lambda _: 1, last_epoch=last_epoch) -def get_constant_schedule_with_warmup(learning_rate: float, - num_warmup_steps: int, - last_epoch: int=-1): +def get_constant_schedule_with_warmup(learning_rate: float, num_warmup_steps: int, last_epoch: int = -1): """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and the initial lr set in the optimizer. @@ -78,10 +76,11 @@ def lr_lambda(current_step: int): def get_linear_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. 
@@ -105,18 +104,19 @@ def lr_lambda(current_step: int): return float(current_step) / float(max(1, num_warmup_steps)) return max( 0.0, - float(num_training_steps - current_step) / - float(max(1, num_training_steps - num_warmup_steps)), ) + float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)), + ) return LambdaDecay(learning_rate, lr_lambda, last_epoch) def get_cosine_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - num_cycles: float=0.5, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the @@ -142,21 +142,19 @@ def get_cosine_schedule_with_warmup( def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float( - max(1, num_training_steps - num_warmup_steps)) - return max( - 0.0, 0.5 * - (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) return LambdaDecay(learning_rate, lr_lambda, last_epoch) def get_cosine_with_hard_restarts_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - num_cycles: int=1, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: int = 1, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases following the values of the cosine function between the initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases @@ -181,25 +179,25 @@ def get_cosine_with_hard_restarts_schedule_with_warmup( def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) - progress = float(current_step - num_warmup_steps) / float( - max(1, num_training_steps - num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) if progress >= 1.0: return 0.0 return max( 0.0, - 0.5 * (1.0 + math.cos(math.pi * ( - (float(num_cycles) * progress) % 1.0))), ) + 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))), + ) return LambdaDecay(learning_rate, lr_lambda, last_epoch) def get_polynomial_decay_schedule_with_warmup( - learning_rate: float, - num_warmup_steps: int, - num_training_steps: int, - lr_end: float=1e-7, - power: float=1.0, - last_epoch: int=-1, ): + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + lr_end: float = 1e-7, + power: float = 1.0, + last_epoch: int = -1, +): """ Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the @@ -230,8 +228,7 @@ def get_polynomial_decay_schedule_with_warmup( lr_init = learning_rate if not (lr_init > lr_end): - raise ValueError( - f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") + 
raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") def lr_lambda(current_step: int): if current_step < num_warmup_steps: @@ -251,8 +248,7 @@ def lr_lambda(current_step: int): TYPE_TO_SCHEDULER_FUNCTION = { SchedulerType.LINEAR: get_linear_schedule_with_warmup, SchedulerType.COSINE: get_cosine_schedule_with_warmup, - SchedulerType.COSINE_WITH_RESTARTS: - get_cosine_with_hard_restarts_schedule_with_warmup, + SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup, SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup, SchedulerType.CONSTANT: get_constant_schedule, SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup, @@ -260,13 +256,14 @@ def lr_lambda(current_step: int): def get_scheduler( - name: Union[str, SchedulerType], - learning_rate: float=0.1, - num_warmup_steps: Optional[int]=None, - num_training_steps: Optional[int]=None, - num_cycles: int=1, - power: float=1.0, - last_epoch: int=-1, ): + name: Union[str, SchedulerType], + learning_rate: float = 0.1, + num_warmup_steps: Optional[int] = None, + num_training_steps: Optional[int] = None, + num_cycles: int = 1, + power: float = 1.0, + last_epoch: int = -1, +): """ Unified API to get any scheduler from its name. @@ -295,20 +292,18 @@ def get_scheduler( # All other schedulers require `num_warmup_steps` if num_warmup_steps is None: - raise ValueError( - f"{name} requires `num_warmup_steps`, please provide that argument.") + raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") if name == SchedulerType.CONSTANT_WITH_WARMUP: return schedule_func( learning_rate=learning_rate, num_warmup_steps=num_warmup_steps, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) # All other schedulers require `num_training_steps` if num_training_steps is None: - raise ValueError( - f"{name} requires `num_training_steps`, please provide that argument." 
- ) + raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") if name == SchedulerType.COSINE_WITH_RESTARTS: return schedule_func( @@ -316,7 +311,8 @@ def get_scheduler( num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, num_cycles=num_cycles, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) if name == SchedulerType.POLYNOMIAL: return schedule_func( @@ -324,10 +320,12 @@ def get_scheduler( num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=power, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) return schedule_func( learning_rate=learning_rate, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, - last_epoch=last_epoch, ) + last_epoch=last_epoch, + ) diff --git a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py b/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py index 24a72c1aa650d..74f6bdbb6b2b6 100644 --- a/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py +++ b/ppdiffusers/ppdiffusers/patches/ppnlp_patch_utils.py @@ -25,11 +25,25 @@ from typing import Any, Callable, Dict, List, Optional, Tuple from ..utils import ( - DIFFUSERS_CACHE, FROM_DIFFUSERS, FROM_HF_HUB, HF_HUB_OFFLINE, - LOW_CPU_MEM_USAGE_DEFAULT, PPDIFFUSERS_CACHE, TO_DIFFUSERS, _add_variant, - _get_model_file, get_logger, is_paddle_available, is_paddlenlp_available, - is_ppxformers_available, is_safetensors_available, is_torch_available, - is_torch_file, smart_load, str2bool) + DIFFUSERS_CACHE, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + LOW_CPU_MEM_USAGE_DEFAULT, + PPDIFFUSERS_CACHE, + TO_DIFFUSERS, + _add_variant, + _get_model_file, + get_logger, + is_paddle_available, + is_paddlenlp_available, + is_ppxformers_available, + is_safetensors_available, + is_torch_available, + is_torch_file, + smart_load, + str2bool, +) logger = get_logger(__name__) @@ -60,8 +74,7 @@ def copy_func(f): "Copy a non-builtin function (NB `copy.copy` does not work for this)" if not isinstance(f, FunctionType): return copy.copy(f) - fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, - f.__closure__) + fn = FunctionType(f.__code__, f.__globals__, f.__name__, f.__defaults__, f.__closure__) fn.__kwdefaults__ = f.__kwdefaults__ fn.__dict__.update(f.__dict__) fn.__annotations__.update(f.__annotations__) @@ -81,7 +94,7 @@ def __get__(self, _, f_cls): def patch_to(cls, as_prop=False, cls_method=False): "Decorator: add `f` to `cls`" if not isinstance(cls, (tuple, list)): - cls = (cls, ) + cls = (cls,) def _inner(f): for c_ in cls: @@ -108,11 +121,11 @@ def _inner(f): def is_floating_point(x): if not isinstance(x, (paddle.Tensor, paddle.static.Variable)): - raise TypeError( - "Expected Tensor, but received type of x: {}".format(type(x))) + raise TypeError("Expected Tensor, but received type of x: {}".format(type(x))) dtype = x.dtype - is_fp_dtype = (dtype == paddle.float32 or dtype == paddle.float64 or - dtype == paddle.float16 or dtype == paddle.bfloat16) + is_fp_dtype = ( + dtype == paddle.float32 or dtype == paddle.float64 or dtype == paddle.float16 or dtype == paddle.bfloat16 + ) return is_fp_dtype if not hasattr(paddle, "is_floating_point"): @@ -219,7 +232,8 @@ def Parameter(data: paddle.Tensor, requires_grad=True): tensor = paddle.create_parameter( data.shape, dtype=data.dtype, - default_initializer=nn.initializer.Assign(data), ) + default_initializer=nn.initializer.Assign(data), + ) if not requires_grad: tensor.stop_gradient = True return tensor @@ -247,8 +261,7 @@ def get_sublayer(self, 
target: str): for item in atoms: if not hasattr(mod, item): - raise AttributeError(mod.__class__.__name__ + " has no " - "attribute `" + item + "`") + raise AttributeError(mod.__class__.__name__ + " has no " "attribute `" + item + "`") mod = getattr(mod, item) @@ -259,23 +272,21 @@ def get_sublayer(self, target: str): nn.Layer.get_sublayer = get_sublayer class _WrappedHook: - def __init__(self, hook: Callable, module: Optional["nn.Layer"]=None): + def __init__(self, hook: Callable, module: Optional["nn.Layer"] = None): self.hook: Callable = hook functools.update_wrapper(self, hook) self.with_module: bool = False if module is not None: - self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref( - module) + self.module: weakref.ReferenceType["nn.Layer"] = weakref.ref(module) self.with_module = True def __call__(self, *args: Any, **kwargs: Any) -> Any: if self.with_module: module = self.module() if module is None: - raise RuntimeError( - "You are trying to call the hook of a dead Module!") + raise RuntimeError("You are trying to call the hook of a dead Module!") return self.hook(module, *args, **kwargs) return self.hook(*args, **kwargs) @@ -292,8 +303,7 @@ def __setstate__(self, state: Dict): if self.with_module: if state["module"] is None: - raise RuntimeError( - "You are trying to revive the hook of a dead Module!") + raise RuntimeError("You are trying to revive the hook of a dead Module!") self.module = weakref.ref(state["module"]) try: @@ -305,22 +315,20 @@ def register_load_state_dict_pre_hook(self, hook, with_module=False): if not hasattr(self, "load_state_dict_pre_hooks"): self.load_state_dict_pre_hooks = OrderedDict() handle = HookRemoveHelper(self.load_state_dict_pre_hooks) - self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook( - hook, self if with_module else None) + self.load_state_dict_pre_hooks[handle._hook_id] = _WrappedHook(hook, self if with_module else None) return handle nn.Layer.register_load_state_dict_pre_hook = register_load_state_dict_pre_hook raw_set_state_dict = nn.Layer.set_state_dict - def set_state_dict(self, state_dict, use_structured_name: bool=True): + def set_state_dict(self, state_dict, use_structured_name: bool = True): if hasattr(self, "load_state_dict_pre_hooks"): for hook in self.load_state_dict_pre_hooks.values(): hook(state_dict) # POP is_torch_weight state_dict.pop("is_torch_weight", None) - return raw_set_state_dict( - self, state_dict, use_structured_name=use_structured_name) + return raw_set_state_dict(self, state_dict, use_structured_name=use_structured_name) nn.Layer.set_state_dict = set_state_dict nn.Layer.load_dict = nn.Layer.set_state_dict @@ -338,12 +346,12 @@ def set_state_dict(self, state_dict, use_structured_name: bool=True): from ..utils.paddle_utils import no_init_weights if is_ppxformers_available(): - from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention + from paddle.incubate.nn.memory_efficient_attention import ( + memory_efficient_attention, + ) from paddle.nn.functional.flash_attention import flash_attention - sdp_kernel = paddle.nn.functional.flash_attention._select_sdp_cuda(128 + - 64) + sdp_kernel = paddle.nn.functional.flash_attention._select_sdp_cuda(128 + 64) if sdp_kernel == "mem_efficient": flash_attn_version = 1 else: @@ -353,33 +361,32 @@ def set_state_dict(self, state_dict, use_structured_name: bool=True): flash_attn_error = None try: _ = flash_attention( - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), - 
paddle.ones( - (1, 1, 2, 40), dtype=paddle.float16), ) + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + paddle.ones((1, 1, 2, 40), dtype=paddle.float16), + ) except Exception as error: flash_attn_error = error is_support_flash_attention = False def scaled_dot_product_attention_( - query, - key, - value, - attn_mask=None, - dropout_p=0.0, - is_causal=False, - scale=None, - training=True, - attention_op=None, ): + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=None, + training=True, + attention_op=None, + ): if attention_op in [None, "auto"]: head_dim = query.shape[-1] attention_op = "cutlass" if is_support_flash_attention and query.dtype in [ - paddle.float16, - paddle.bfloat16, + paddle.float16, + paddle.bfloat16, ]: if flash_attn_version == 1: if head_dim <= 128: @@ -403,17 +410,12 @@ def scaled_dot_product_attention_( else: if attn_mask is not None: attn_mask = paddle.transpose(attn_mask, [0, 2, 1, 3]) - if (attn_mask.cast("float32").min() == 0 and - attn_mask.cast("float32").max() == 1): + if attn_mask.cast("float32").min() == 0 and attn_mask.cast("float32").max() == 1: attn_mask = (attn_mask.cast(s.dtype) - 1) * 10000.0 s = s + attn_mask p = paddle.nn.functional.softmax(s, axis=-1) if dropout_p > 0.0: - p = paddle.nn.functional.dropout( - p, - dropout_p, - training=training, - mode="upscale_in_train") + p = paddle.nn.functional.dropout(p, dropout_p, training=training, mode="upscale_in_train") o = paddle.matmul(p, vt) return paddle.transpose(o, [0, 2, 1, 3]) elif attention_op == "cutlass": @@ -427,7 +429,8 @@ def scaled_dot_product_attention_( None, p=dropout_p if training else 0.0, scale=scale, - training=True, ) # make sure we use training=True + training=True, + ) # make sure we use training=True elif attention_op == "flash": output = flash_attention( query, @@ -435,15 +438,13 @@ def scaled_dot_product_attention_( value, dropout=dropout_p, causal=is_causal, - return_softmax=False, )[0] + return_softmax=False, + )[0] else: - raise ValueError( - "ppxformers's attention_op shoulde be in ['cutlass', 'flash', 'math']" - ) + raise ValueError("ppxformers's attention_op shoulde be in ['cutlass', 'flash', 'math']") return output - paddle.nn.functional.scaled_dot_product_attention_ = ( - scaled_dot_product_attention_) + paddle.nn.functional.scaled_dot_product_attention_ = scaled_dot_product_attention_ @patch_to(nn.Layer, as_prop=True) def dtype(parameter: nn.Layer) -> paddle.dtype: @@ -474,8 +475,10 @@ def device(self): from shutil import copyfile import sentencepiece as spm - from paddlenlp.transformers.tokenizer_utils import (AddedToken, - PretrainedTokenizer) + from paddlenlp.transformers.tokenizer_utils import ( + AddedToken, + PretrainedTokenizer, + ) SPIECE_UNDERLINE = "▁" @@ -495,24 +498,24 @@ class XLMRobertaTokenizer(PretrainedTokenizer): model_input_names = ["input_ids", "attention_mask"] def __init__( - self, - vocab_file, - bos_token="<s>", - eos_token="</s>", - sep_token="</s>", - cls_token="<s>", - unk_token="<unk>", - pad_token="<pad>", - mask_token="<mask>", - sp_model_kwargs: Optional[Dict[str, Any]]=None, - **kwargs, ) -> None: + self, + vocab_file, + bos_token="<s>", + eos_token="</s>", + sep_token="</s>", + cls_token="<s>", + unk_token="<unk>", + pad_token="<pad>", + mask_token="<mask>", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: # Mask token behave like a normal word, i.e.
include the space before it - mask_token = (AddedToken( - mask_token, lstrip=True, rstrip=False) - if isinstance(mask_token, str) else mask_token) + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + ) - self.sp_model_kwargs = ({} if sp_model_kwargs is None else - sp_model_kwargs) + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs super().__init__( bos_token=bos_token, @@ -523,10 +526,10 @@ def __init__( pad_token=pad_token, mask_token=mask_token, sp_model_kwargs=self.sp_model_kwargs, - **kwargs, ) + **kwargs, + ) - self.sp_model = spm.SentencePieceProcessor( - **self.sp_model_kwargs) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -547,12 +550,8 @@ def __init__( # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab self.fairseq_offset = 1 - self.fairseq_tokens_to_ids[""] = ( - len(self.sp_model) + self.fairseq_offset) - self.fairseq_ids_to_tokens = { - v: k - for k, v in self.fairseq_tokens_to_ids.items() - } + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def __getstate__(self): state = self.__dict__.copy() @@ -567,14 +566,12 @@ def __setstate__(self, d): if not hasattr(self, "sp_model_kwargs"): self.sp_model_kwargs = {} - self.sp_model = spm.SentencePieceProcessor( - **self.sp_model_kwargs) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.LoadFromSerializedProto(self.sp_model_proto) def build_inputs_with_special_tokens( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]]=None) -> List[int]: + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and adding special tokens. An XLM-RoBERTa sequence has the following format: @@ -590,17 +587,17 @@ def build_inputs_with_special_tokens( """ if token_ids_1 is None: - return [self.cls_token_id - ] + token_ids_0 + [self.sep_token_id] + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] cls = [self.cls_token_id] sep = [self.sep_token_id] return cls + token_ids_0 + sep + sep + token_ids_1 + sep def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]]=None, - already_has_special_tokens: bool=False, ) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. 
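For quick reference, the single-sequence and sequence-pair layouts produced by build_inputs_with_special_tokens above can be checked with a tiny standalone sketch; the integer ids below are placeholders, not the real XLM-RoBERTa vocabulary ids.

# Standalone sketch of the two layouts built above (ids are illustrative only).
CLS, SEP = 0, 2  # stand-ins for <s> and </s>

def build_inputs(token_ids_0, token_ids_1=None):
    if token_ids_1 is None:
        return [CLS] + token_ids_0 + [SEP]                         # <s> A </s>
    return [CLS] + token_ids_0 + [SEP, SEP] + token_ids_1 + [SEP]  # <s> A </s></s> B </s>

assert build_inputs([11, 12]) == [0, 11, 12, 2]
assert build_inputs([11, 12], [21]) == [0, 11, 12, 2, 2, 21, 2]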
@@ -619,17 +616,16 @@ def get_special_tokens_mask( return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, - already_has_special_tokens=True, ) + already_has_special_tokens=True, + ) if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] - return ([1] + ([0] * len(token_ids_0)) + [1, 1] + - ([0] * len(token_ids_1)) + [1]) + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]]=None) -> List[int]: + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned. @@ -647,19 +643,14 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + - sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] @property def vocab_size(self): - return (len(self.sp_model) + self.fairseq_offset + 1 - ) # Add the token + return len(self.sp_model) + self.fairseq_offset + 1 # Add the token def get_vocab(self): - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab @@ -683,33 +674,28 @@ def _convert_id_to_token(self, index): def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = "".join(tokens).replace(SPIECE_UNDERLINE, - " ").strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string - def save_vocabulary( - self, save_directory: str, - filename_prefix: Optional[str]=None) -> Tuple[str]: + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): - logger.error( - f"Vocabulary path ({save_directory}) should be a directory" - ) + logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") + - self.resource_files_names["vocab_file"], ) + (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"], + ) - if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file) and os.path.isfile(self.vocab_file): + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile( + self.vocab_file + ): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto( - ) + content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file, ) + return (out_vocab_file,) paddlenlp.transformers.XLMRobertaTokenizer = XLMRobertaTokenizer @@ -719,16 +705,17 @@ def save_vocabulary( BertModel.raw_forward = BertModel.forward def forward_new( - self, - input_ids: paddle.Tensor, - token_type_ids: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]]=None, - use_cache: 
Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - output_attentions: Optional[bool]=None, - return_dict: Optional[bool]=None, ): + self, + input_ids: paddle.Tensor, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): if attention_mask is None: attention_mask = paddle.ones_like(input_ids) return self.raw_forward( @@ -740,7 +727,8 @@ def forward_new( use_cache=use_cache, output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_dict=return_dict, ) + return_dict=return_dict, + ) BertModel.forward = forward_new @@ -748,13 +736,10 @@ def forward_new( TRANSFORMERS_WEIGHTS_NAME = "pytorch_model.bin" # patch from_pretrained and save_pretrained - def from_pretrained_v3(cls, - pretrained_model_name_or_path, - *args, - from_hf_hub: bool=False, - **kwargs): - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + def from_pretrained_v3(cls, pretrained_model_name_or_path, *args, from_hf_hub: bool = False, **kwargs): + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) from_diffusers = kwargs.pop("from_diffusers", None) @@ -773,8 +758,7 @@ def from_pretrained_v3(cls, paddle_dtype = _dtype subfolder = kwargs.pop("subfolder", None) variant = kwargs.pop("variant", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", - LOW_CPU_MEM_USAGE_DEFAULT) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) user_agent = { "ppdiffusers": __version__, @@ -787,8 +771,7 @@ def from_pretrained_v3(cls, model_kwargs = kwargs # 1. get the PretrainedConfig to init model if not isinstance(config, PretrainedConfig): - config_path = (config if config is not None else - pretrained_model_name_or_path) + config_path = config if config is not None else pretrained_model_name_or_path # TODO fix config from_pretrained # must from hf hub @@ -797,9 +780,11 @@ def from_pretrained_v3(cls, kwargs["subfolder"] = subfolder else: if subfolder is not None: - config_path = (os.path.join(config_path, subfolder) - if os.path.isdir(config_path) else - "/".join([config_path, subfolder])) + config_path = ( + os.path.join(config_path, subfolder) + if os.path.isdir(config_path) + else "/".join([config_path, subfolder]) + ) config, model_kwargs = cls.config_class.from_pretrained( config_path, @@ -807,12 +792,12 @@ def from_pretrained_v3(cls, return_unused_kwargs=True, force_download=force_download, from_hf_hub=from_hf_hub, - **kwargs, ) + **kwargs, + ) assert config is not None # we will remove in the future. 
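The file relies throughout on the same wrap-and-reassign pattern seen in forward_new above (and in raw_set_state_dict earlier): keep a reference to the original method, install a thin wrapper that only fills in a default, then delegate. A minimal, framework-free sketch with illustrative names:

# Framework-free sketch of the wrap-and-reassign patching pattern (names are
# illustrative, not paddlenlp's BertModel).
class Model:
    def forward(self, x, mask=None):
        return x, mask

raw_forward = Model.forward

def forward_with_default_mask(self, x, mask=None):
    if mask is None:
        mask = [1] * len(x)          # same idea as paddle.ones_like(input_ids)
    return raw_forward(self, x, mask=mask)

Model.forward = forward_with_default_mask
assert Model().forward([3, 4]) == ([3, 4], [1, 1])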
- if not from_hf_hub and not os.path.exists( - os.path.join(cache_dir, config_path, "config.json")): + if not from_hf_hub and not os.path.exists(os.path.join(cache_dir, config_path, "config.json")): config.save_pretrained(os.path.join(cache_dir, config_path)) if paddle_dtype is None: @@ -825,8 +810,7 @@ def from_pretrained_v3(cls, try: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=_add_variant( - TRANSFORMERS_SAFE_WEIGHTS_NAME, variant), + weights_name=_add_variant(TRANSFORMERS_SAFE_WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -836,15 +820,15 @@ def from_pretrained_v3(cls, revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) except Exception: # noqa: E722 model_file = None pass if model_file is None: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=_add_variant(TRANSFORMERS_WEIGHTS_NAME, - variant), + weights_name=_add_variant(TRANSFORMERS_WEIGHTS_NAME, variant), cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -854,7 +838,8 @@ def from_pretrained_v3(cls, revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) else: model_file = _get_model_file( pretrained_model_name_or_path, @@ -868,19 +853,20 @@ def from_pretrained_v3(cls, revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) assert model_file is not None # try load model_file with paddle / torch / safetensor state_dict = smart_load(model_file) init_contexts = [] - dtype = set(v.dtype for v in state_dict.values() - if paddle.is_tensor(v) and paddle.is_floating_point(v)) + dtype = set(v.dtype for v in state_dict.values() if paddle.is_tensor(v) and paddle.is_floating_point(v)) if len(dtype) > 1 and paddle.float32 not in dtype: raise ValueError( f"The weights of the model file {model_file} have a mixture of incompatible dtypes {dtype}. Please" - f" make sure that {model_file} weights have only one dtype.") + f" make sure that {model_file} weights have only one dtype." + ) elif len(dtype) > 1 and paddle.float32 in dtype: dtype = paddle.float32 elif len(dtype) == 0: @@ -900,22 +886,18 @@ def from_pretrained_v3(cls, model = cls(config, **model_kwargs) # convert weights - if (from_diffusers or is_torch_file(model_file)) and hasattr( - cls, "smart_convert"): + if (from_diffusers or is_torch_file(model_file)) and hasattr(cls, "smart_convert"): state_dict = cls.smart_convert(state_dict, model) loaded_state_dict_keys = list(state_dict.keys()) - ( - model, - missing_keys, - unexpected_keys, - mismatched_keys, ) = cls._load_pretrained_model_old( - model=model, - state_dict=state_dict, - loaded_keys=loaded_state_dict_keys, - ignore_mismatched_sizes=ignore_mismatched_sizes, - dtype=None, ) + (model, missing_keys, unexpected_keys, mismatched_keys,) = cls._load_pretrained_model_old( + model=model, + state_dict=state_dict, + loaded_keys=loaded_state_dict_keys, + ignore_mismatched_sizes=ignore_mismatched_sizes, + dtype=None, + ) loading_info = { "missing_keys": missing_keys, "unexpected_keys": unexpected_keys, @@ -941,9 +923,7 @@ def from_pretrained_v3(cls, " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." 
) else: - logger.info( - f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" - ) + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") if len(missing_keys) > 0: logger.warning( @@ -956,17 +936,21 @@ def from_pretrained_v3(cls, f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" f" was trained on, you can already use {model.__class__.__name__} for predictions without further" - " training.") + " training." + ) if len(mismatched_keys) > 0: - mismatched_warning = "\n".join([ - f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" - for key, shape1, shape2 in mismatched_keys - ]) + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" - " to use it for predictions and inference.") + " to use it for predictions and inference." + ) if output_loading_info: return model, loading_info @@ -979,12 +963,13 @@ def from_pretrained_v3(cls, @classmethod def _load_pretrained_model_old( - cls, - model: PretrainedModel, - state_dict: Dict[str, paddle.Tensor], - loaded_keys: List[str], - ignore_mismatched_sizes=False, - dtype=None, ) -> Tuple[List[str]]: + cls, + model: PretrainedModel, + state_dict: Dict[str, paddle.Tensor], + loaded_keys: List[str], + ignore_mismatched_sizes=False, + dtype=None, + ) -> Tuple[List[str]]: model_state_dict = model.state_dict() expected_keys = list(model_state_dict.keys()) @@ -992,8 +977,7 @@ def _load_pretrained_model_old( if len(prefix) > 0: has_prefix_module = any(s.startswith(prefix) for s in loaded_keys) - expects_prefix_module = any( - s.startswith(prefix) for s in expected_keys) + expects_prefix_module = any(s.startswith(prefix) for s in expected_keys) else: has_prefix_module = False expects_prefix_module = False @@ -1004,10 +988,7 @@ def _load_pretrained_model_old( add_prefix_to_model = has_prefix_module and not expects_prefix_module if remove_prefix_from_model: - expected_keys = [ - ".".join(s.split(".")[1:]) if s.startswith(prefix) else s - for s in expected_keys - ] + expected_keys = [".".join(s.split(".")[1:]) if s.startswith(prefix) else s for s in expected_keys] elif add_prefix_to_model: expected_keys = [".".join([prefix, s]) for s in expected_keys] @@ -1018,31 +999,26 @@ def _load_pretrained_model_old( # the user. 
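How the expected/loaded key comparison above behaves when only the model side carries the base-model prefix can be seen in a small standalone example; the prefix and key names here are made up.

# Simplified sketch (not the real implementation) of the prefix bookkeeping:
# the checkpoint has no base-model prefix, the model expects one, so the
# prefix is stripped from expected keys before diffing.
prefix = "bert"  # hypothetical base_model_prefix
loaded_keys = ["embeddings.weight", "encoder.0.weight"]
expected_keys = ["bert.embeddings.weight", "bert.encoder.0.weight", "cls.bias"]

has_prefix_module = any(k.startswith(prefix) for k in loaded_keys)        # False
expects_prefix_module = any(k.startswith(prefix) for k in expected_keys)  # True

if not has_prefix_module and expects_prefix_module:
    expected_keys = [".".join(k.split(".")[1:]) if k.startswith(prefix) else k
                     for k in expected_keys]

missing_keys = sorted(set(expected_keys) - set(loaded_keys))     # ['cls.bias']
unexpected_keys = sorted(set(loaded_keys) - set(expected_keys))  # []
assert missing_keys == ["cls.bias"] and unexpected_keys == []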
if cls._keys_to_ignore_on_load_missing is not None: for pat in cls._keys_to_ignore_on_load_missing: - missing_keys = [ - k for k in missing_keys if re.search(pat, k) is None - ] + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] if cls._keys_to_ignore_on_load_unexpected is not None: for pat in cls._keys_to_ignore_on_load_unexpected: - unexpected_keys = [ - k for k in unexpected_keys if re.search(pat, k) is None - ] + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] # Make sure we are able to load base models as well as derived models (with heads) start_prefix = "" model_to_load = model - if (len(cls.base_model_prefix) > 0 and - not hasattr(model, cls.base_model_prefix) and - has_prefix_module): + if len(cls.base_model_prefix) > 0 and not hasattr(model, cls.base_model_prefix) and has_prefix_module: start_prefix = cls.base_model_prefix + "." def _find_mismatched_keys( - state_dict, - model_state_dict, - loaded_keys, - add_prefix_to_model, - remove_prefix_from_model, - ignore_mismatched_sizes, ): + state_dict, + model_state_dict, + loaded_keys, + add_prefix_to_model, + remove_prefix_from_model, + ignore_mismatched_sizes, + ): mismatched_keys = [] if ignore_mismatched_sizes: for checkpoint_key in loaded_keys: @@ -1054,13 +1030,17 @@ def _find_mismatched_keys( # The model key doesn't start with `prefix` but `checkpoint_key` does so we remove it. model_key = ".".join(checkpoint_key.split(".")[1:]) - if (model_key in model_state_dict and - state_dict[checkpoint_key].shape != - model_state_dict[model_key].shape): - mismatched_keys.append(( - checkpoint_key, - state_dict[checkpoint_key].shape, - model_state_dict[model_key].shape, )) + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + ( + checkpoint_key, + state_dict[checkpoint_key].shape, + model_state_dict[model_key].shape, + ) + ) del state_dict[checkpoint_key] return mismatched_keys @@ -1071,7 +1051,8 @@ def _find_mismatched_keys( loaded_keys, add_prefix_to_model, remove_prefix_from_model, - ignore_mismatched_sizes, ) + ignore_mismatched_sizes, + ) start_prefix = prefix + "." @@ -1090,8 +1071,7 @@ def _find_mismatched_keys( if add_prefix_to_model: for key in list(state_dict.keys()): if key.startswith(start_prefix): - state_dict[key.replace(start_prefix, "")] = state_dict.pop( - key) + state_dict[key.replace(start_prefix, "")] = state_dict.pop(key) if remove_prefix_from_model: for key in list(state_dict.keys()): @@ -1126,12 +1106,9 @@ def _find_mismatched_keys( # this is the temp hard code for fused-mt transformer if model.keep_in_fp32_modules(key, model.config, dtype): target_dtype = "float32" - state_dict[key] = paddle.cast( - state_dict[key], dtype=target_dtype) + state_dict[key] = paddle.cast(state_dict[key], dtype=target_dtype) else: - raise ValueError( - f"the dtype<{state_dict[key].dtype}> of current state-dict[{key}] is not valid" - ) + raise ValueError(f"the dtype<{state_dict[key].dtype}> of current state-dict[{key}] is not valid") else: dtype_prefix_len = len("paddle.") for k, v in model_to_load.state_dict().items(): @@ -1155,8 +1132,7 @@ def _find_mismatched_keys( # To avoid recursive import temporarily. 
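When ignore_mismatched_sizes is set, _find_mismatched_keys above only compares shapes and evicts the offending entries from the state dict; reduced to plain shape tuples the behaviour looks like this toy example.

# Toy version of the _find_mismatched_keys logic: shape tuples stand in for
# tensors, and mismatched entries are recorded and dropped.
model_state_dict = {"linear.weight": (768, 768), "linear.bias": (768,)}
state_dict = {"linear.weight": (512, 768), "linear.bias": (768,)}

mismatched_keys = []
for key in list(state_dict):
    if key in model_state_dict and state_dict[key] != model_state_dict[key]:
        mismatched_keys.append((key, state_dict[key], model_state_dict[key]))
        del state_dict[key]  # the mismatched weight is not loaded, only reported

assert mismatched_keys == [("linear.weight", (512, 768), (768, 768))]
assert list(state_dict) == ["linear.bias"]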
import paddlenlp.ops.fast_transformer.transformer.decoding as ft_decoding - state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model( - model_to_load, state_dict) + state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model(model_to_load, state_dict) if paddle.in_dynamic_mode(): model_to_load.set_state_dict(state_to_load) @@ -1170,19 +1146,20 @@ def _find_mismatched_keys( @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path, - *args, - from_hf_hub=False, - subfolder=None, - paddle_dtype=None, - from_diffusers=None, - variant=None, - **kwargs, ): + cls, + pretrained_model_name_or_path, + *args, + from_hf_hub=False, + subfolder=None, + paddle_dtype=None, + from_diffusers=None, + variant=None, + **kwargs, + ): try: if cls.constructed_from_pretrained_config() and ( - hasattr(cls, "smart_convert") or - hasattr(cls, "register_load_torch_hook")): + hasattr(cls, "smart_convert") or hasattr(cls, "register_load_torch_hook") + ): return from_pretrained_v3( cls, pretrained_model_name_or_path, @@ -1192,7 +1169,8 @@ def from_pretrained( paddle_dtype=paddle_dtype, from_diffusers=from_diffusers, variant=variant, - **kwargs, ) + **kwargs, + ) except Exception: pass @@ -1206,7 +1184,8 @@ def from_pretrained( from_hf_hub=from_hf_hub, subfolder=subfolder, dtype=dtype, - **kwargs, ) + **kwargs, + ) PretrainedModel.from_pretrained = from_pretrained @@ -1214,51 +1193,43 @@ def from_pretrained( from safetensors.numpy import save_file as safetensors_numpy_save_file if is_torch_available(): - from safetensors.torch import \ - save_file as safetensors_torch_save_file + from safetensors.torch import save_file as safetensors_torch_save_file if is_torch_available(): import torch def save_pretrained_v3( - self: PretrainedModel, - save_directory: str, - is_main_process: bool=True, - save_function: Callable=None, - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: Optional[bool]=None, ): - from ..models.modeling_pytorch_paddle_utils import \ - convert_paddle_state_dict_to_pytorch + self: PretrainedModel, + save_directory: str, + is_main_process: bool = True, + save_function: Callable = None, + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: Optional[bool] = None, + ): + from ..models.modeling_pytorch_paddle_utils import ( + convert_paddle_state_dict_to_pytorch, + ) from ..models.modeling_utils import convert_state_dict if to_diffusers is None: to_diffusers = TO_DIFFUSERS - if to_diffusers and safe_serialization and not is_safetensors_available( - ): - raise ImportError( - "`safe_serialization` requires the `safetensors library: `pip install safetensors`." 
- ) + if to_diffusers and safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return - model_to_save = self._layers if isinstance( - self, paddle.DataParallel) else self + model_to_save = self._layers if isinstance(self, paddle.DataParallel) else self if is_main_process: try: - model_to_save.config.dtype = str(model_to_save._dtype).split( - ".")[-1] + model_to_save.config.dtype = str(model_to_save._dtype).split(".")[-1] except: model_to_save.config.dtype = "float32" # Attach architecture to the config - model_to_save.config.architectures = [ - model_to_save.__class__.__name__ - ] + model_to_save.config.architectures = [model_to_save.__class__.__name__] model_to_save.config.save_pretrained(save_directory) @@ -1273,12 +1244,10 @@ def save_pretrained_v3( if safe_serialization: if is_torch_available(): save_function = safetensors_torch_save_file - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") else: save_function = safetensors_numpy_save_file - state_dict = convert_state_dict( - state_dict, framework="numpy") + state_dict = convert_state_dict(state_dict, framework="numpy") weights_name = _add_variant("model.safetensors", variant) else: if not is_torch_available(): @@ -1287,11 +1256,9 @@ def save_pretrained_v3( ) save_function = torch.save weights_name = _add_variant("pytorch_model.bin", variant) - state_dict = convert_state_dict( - state_dict, framework="torch") + state_dict = convert_state_dict(state_dict, framework="torch") - state_dict = convert_paddle_state_dict_to_pytorch(state_dict, - model_to_save) + state_dict = convert_paddle_state_dict_to_pytorch(state_dict, model_to_save) else: save_function = paddle.save weights_name = _add_variant("model_state.pdparams", variant) @@ -1299,24 +1266,22 @@ def save_pretrained_v3( # Save the model save_function(state_dict, os.path.join(save_directory, weights_name)) - logger.info( - f"Model weights saved in {os.path.join(save_directory, weights_name)}" - ) + logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}") def save_pretrained( - self, - save_dir: str, - is_main_process: bool=True, - state_dict=None, - save_function: Callable=None, - max_shard_size="10GB", - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: Optional[bool]=None, - *args, - **kwargs, ): - if self.constructed_from_pretrained_config() and hasattr( - self, "smart_convert"): + self, + save_dir: str, + is_main_process: bool = True, + state_dict=None, + save_function: Callable = None, + max_shard_size="10GB", + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: Optional[bool] = None, + *args, + **kwargs, + ): + if self.constructed_from_pretrained_config() and hasattr(self, "smart_convert"): return save_pretrained_v3( self, save_dir, @@ -1324,7 +1289,8 @@ def save_pretrained( save_function=save_function, safe_serialization=safe_serialization, variant=variant, - to_diffusers=to_diffusers, ) + to_diffusers=to_diffusers, + ) return raw_save_pretrained( self, save_dir=save_dir, @@ -1335,32 +1301,40 @@ def save_pretrained( safe_serialization=safe_serialization, variant=variant, *args, - **kwargs, ) + 
**kwargs, + ) PretrainedModel.save_pretrained = save_pretrained from paddlenlp.transformers import ( - BertModel, BitBackbone, ClapTextModelWithProjection, CLIPTextModel, - CLIPTextModelWithProjection, CLIPVisionModel, - CLIPVisionModelWithProjection, DPTForDepthEstimation, SpeechT5HifiGan, - T5EncoderModel) + BertModel, + BitBackbone, + ClapTextModelWithProjection, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPVisionModel, + CLIPVisionModelWithProjection, + DPTForDepthEstimation, + SpeechT5HifiGan, + T5EncoderModel, + ) if not hasattr(T5EncoderModel, "_keep_in_fp32_modules"): T5EncoderModel._keep_in_fp32_modules = ["wo"] - from ..models.modeling_pytorch_paddle_utils import \ - convert_pytorch_state_dict_to_paddle_class_method - from ..pipelines.alt_diffusion.modeling_roberta_series import \ - RobertaSeriesModelWithTransformation + from ..models.modeling_pytorch_paddle_utils import ( + convert_pytorch_state_dict_to_paddle_class_method, + ) + from ..pipelines.alt_diffusion.modeling_roberta_series import ( + RobertaSeriesModelWithTransformation, + ) from ..pipelines.deepfloyd_if.safety_checker import IFSafetyChecker - from ..pipelines.latent_diffusion.pipeline_latent_diffusion import \ - LDMBertModel - from ..pipelines.paint_by_example.image_encoder import \ - PaintByExampleImageEncoder - from ..pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker - from ..pipelines.stable_diffusion_safe.safety_checker import \ - SafeStableDiffusionSafetyChecker + from ..pipelines.latent_diffusion.pipeline_latent_diffusion import LDMBertModel + from ..pipelines.paint_by_example.image_encoder import PaintByExampleImageEncoder + from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker + from ..pipelines.stable_diffusion_safe.safety_checker import ( + SafeStableDiffusionSafetyChecker, + ) @classmethod def clip_smart_convert(cls, state_dict, pd_model): @@ -1380,7 +1354,9 @@ def clip_smart_convert(cls, state_dict, pd_model): ".pre_layrnorm.": ".ln_pre.", ".post_layernorm.": ".ln_post.", } - ignore_value = ["position_ids", ] + ignore_value = [ + "position_ids", + ] if cls in [PaintByExampleImageEncoder]: # ignore mapper. prefix, we will use convert_pytorch_state_dict_to_paddle to convert mapper.xxxx state_dict ignore_value.append("mapper.") @@ -1410,11 +1386,11 @@ def clip_smart_convert(cls, state_dict, pd_model): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale" and value.ndim == 1: - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and cls in [ - StableDiffusionSafetyChecker, - SafeStableDiffusionSafetyChecker, + StableDiffusionSafetyChecker, + SafeStableDiffusionSafetyChecker, ]: name = "clip." 
+ name new_model_state[name] = value @@ -1423,8 +1399,7 @@ def clip_smart_convert(cls, state_dict, pd_model): if cls in [PaintByExampleImageEncoder]: # convert mapper - mappersd = cls.smart_convert( - state_dict, pd_model, sub_layer="mapper.") + mappersd = cls.smart_convert(state_dict, pd_model, sub_layer="mapper.") new_model_state.update(mappersd) return new_model_state @@ -1451,10 +1426,8 @@ def bert_smart_convert(cls, state_dict, pd_model): # about cls predictions ignore "cls.predictions.transform.dense": "cls.predictions.transform", "cls.predictions.decoder.weight": "cls.predictions.decoder_weight", - "cls.predictions.transform.LayerNorm.weight": - "cls.predictions.layer_norm.weight", - "cls.predictions.transform.LayerNorm.bias": - "cls.predictions.layer_norm.bias", + "cls.predictions.transform.LayerNorm.weight": "cls.predictions.layer_norm.weight", + "cls.predictions.transform.LayerNorm.bias": "cls.predictions.layer_norm.bias", "cls.predictions.bias": "cls.predictions.decoder_bias", } ignore_value = ["position_ids"] @@ -1481,8 +1454,7 @@ def bert_smart_convert(cls, state_dict, pd_model): def ldmbert_smart_convert(cls, state_dict, pd_model): transformers2ppnlp = { "model.embed_tokens.weight": "embeddings.word_embeddings.weight", - "model.embed_positions.weight": - "embeddings.position_embeddings.weight", + "model.embed_positions.weight": "embeddings.position_embeddings.weight", "model.layer_norm.": "final_layer_norm.", "model.layers": "encoder.layers", ".self_attn_layer_norm.": ".norm1.", @@ -1513,14 +1485,14 @@ def ldmbert_smart_convert(cls, state_dict, pd_model): LDMBertModel.smart_convert = ldmbert_smart_convert for cls_ in [ - CLIPTextModel, - CLIPTextModelWithProjection, - CLIPVisionModel, - CLIPVisionModelWithProjection, - StableDiffusionSafetyChecker, - SafeStableDiffusionSafetyChecker, - PaintByExampleImageEncoder, - IFSafetyChecker, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPVisionModel, + CLIPVisionModelWithProjection, + StableDiffusionSafetyChecker, + SafeStableDiffusionSafetyChecker, + PaintByExampleImageEncoder, + IFSafetyChecker, ]: setattr(cls_, "smart_convert", clip_smart_convert) @@ -1532,8 +1504,12 @@ def ldmbert_smart_convert(cls, state_dict, pd_model): else: # NEW TRANSFORMERS CLIP MODEL from ..pipelines.stable_diffusion.hf_clip_model import ( - HFCLIPModel, HFCLIPTextModel, HFCLIPTextModelWithProjection, - HFCLIPVisionModel, HFCLIPVisionModelWithProjection) + HFCLIPModel, + HFCLIPTextModel, + HFCLIPTextModelWithProjection, + HFCLIPVisionModel, + HFCLIPVisionModelWithProjection, + ) TRANSFORMERS_CLIP_MODEL = [ HFCLIPModel, @@ -1543,29 +1519,27 @@ def ldmbert_smart_convert(cls, state_dict, pd_model): HFCLIPVisionModelWithProjection, ] for cls_ in [ - DPTForDepthEstimation, - BitBackbone, - SpeechT5HifiGan, - ClapTextModelWithProjection, - T5EncoderModel, + DPTForDepthEstimation, + BitBackbone, + SpeechT5HifiGan, + ClapTextModelWithProjection, + T5EncoderModel, ] + TRANSFORMERS_CLIP_MODEL: - setattr(cls_, "smart_convert", - convert_pytorch_state_dict_to_paddle_class_method) + setattr(cls_, "smart_convert", convert_pytorch_state_dict_to_paddle_class_method) # TODO remove this when we updage ImageProcessingMixin # patch get_image_processor_dict support subfolder. 
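The various smart_convert helpers above all follow the same recipe: rename checkpoint keys with a substring map and transpose 2-D Linear weights. A toy version of that idea; the mapping entries and key names are illustrative, not the full tables used above.

# Toy illustration of the smart_convert recipe (illustrative mapping only).
import numpy as np

torch2paddle = {".encoder.layer.": ".encoder.layers.", ".LayerNorm.": ".norm."}

def rename(name):
    for old, new in torch2paddle.items():
        name = name.replace(old, new)
    return name

state_dict = {
    "bert.encoder.layer.0.output.dense.weight": np.zeros((8, 4)),
    "bert.encoder.layer.0.output.LayerNorm.weight": np.ones((4,)),
}
converted = {}
for name, value in state_dict.items():
    if value.ndim == 2 and "LayerNorm" not in name and "embeddings" not in name:
        value = value.T  # torch Linear stores (out, in); paddle expects (in, out)
    converted[rename(name)] = value

assert converted["bert.encoder.layers.0.output.dense.weight"].shape == (4, 8)
assert "bert.encoder.layers.0.output.norm.weight" in converted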
IMAGE_PROCESSOR_NAME = "preprocessor_config.json" - from paddlenlp.transformers.feature_extraction_utils import \ - FeatureExtractionMixin - from paddlenlp.transformers.image_processing_utils import \ - ImageProcessingMixin + from paddlenlp.transformers.feature_extraction_utils import FeatureExtractionMixin + from paddlenlp.transformers.image_processing_utils import ImageProcessingMixin @classmethod def get_image_processor_dict(cls, pretrained_model_name_or_path, **kwargs): from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) @@ -1589,12 +1563,11 @@ def get_image_processor_dict(cls, pretrained_model_name_or_path, **kwargs): revision=revision, subfolder=subfolder, user_agent=user_agent, - from_hf_hub=from_hf_hub, ) + from_hf_hub=from_hf_hub, + ) try: # Load image_processor dict - with open( - resolved_image_processor_file, "r", - encoding="utf-8") as reader: + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: text = reader.read() image_processor_dict = json.loads(text) diff --git a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py b/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py index 367da2b281b53..7000346e862f7 100644 --- a/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py +++ b/ppdiffusers/ppdiffusers/patches/tomesd_patch_utils.py @@ -32,41 +32,36 @@ def scatter_reduce( - input: paddle.Tensor, - dim: int, - index: paddle.Tensor, - src: paddle.Tensor, - reduce: str="mean", - include_self: bool=True, ) -> paddle.Tensor: + input: paddle.Tensor, + dim: int, + index: paddle.Tensor, + src: paddle.Tensor, + reduce: str = "mean", + include_self: bool = True, +) -> paddle.Tensor: # reduce "sum", "prod", "mean", # TODO support "amax", "amin" and include_self = False if reduce in ["sum", "assign", "add"]: if reduce == "sum": reduce = "add" - input.put_along_axis_( - indices=index, values=src, axis=dim, reduce=reduce) + input.put_along_axis_(indices=index, values=src, axis=dim, reduce=reduce) elif reduce == "mean": # compute sum first input.put_along_axis_(indices=index, values=src, axis=dim, reduce="add") # compute div secondly input_div = paddle.ones_like(input).put_along_axis( indices=index, - values=paddle.to_tensor( - 1.0, dtype=input.dtype), + values=paddle.to_tensor(1.0, dtype=input.dtype), axis=dim, - reduce="add", ) + reduce="add", + ) input = input / input_div elif reduce in ["prod", "mul", "multiply"]: - input = paddle.put_along_axis( - input.cpu(), - indices=index.cpu(), - values=src.cpu(), - axis=dim, - reduce="mul")._to(device=paddle.get_device()) - else: - raise NotImplementedError( - "only support mode in ['add', 'sum', 'prod', 'mul', 'multiply', 'mean', 'assign']!" 
+ input = paddle.put_along_axis(input.cpu(), indices=index.cpu(), values=src.cpu(), axis=dim, reduce="mul")._to( + device=paddle.get_device() ) + else: + raise NotImplementedError("only support mode in ['add', 'sum', 'prod', 'mul', 'multiply', 'mean', 'assign']!") return input @@ -75,18 +70,19 @@ def scatter_reduce( paddle.Tensor.scatter_reduce = scatter_reduce -def do_nothing(x: paddle.Tensor, mode: str=None): +def do_nothing(x: paddle.Tensor, mode: str = None): return x def bipartite_soft_matching_random2d( - metric: paddle.Tensor, - w: int, - h: int, - sx: int, - sy: int, - r: int, - no_rand: bool=False, ) -> Tuple[Callable, Callable]: + metric: paddle.Tensor, + w: int, + h: int, + sx: int, + sy: int, + r: int, + no_rand: bool = False, +) -> Tuple[Callable, Callable]: """ Partitions the tokens into src and dst and merges r tokens from src to dst. Dst tokens are partitioned by choosing one randomy in each (sx, sy) region. @@ -112,24 +108,23 @@ def bipartite_soft_matching_random2d( if no_rand: rand_idx = paddle.zeros((hsy, wsx, 1), dtype=paddle.int64) else: - rand_idx = paddle.randint( - sy * sx, shape=(hsy, wsx, 1), dtype=paddle.int64) + rand_idx = paddle.randint(sy * sx, shape=(hsy, wsx, 1), dtype=paddle.int64) # The image might not divide sx and sy, so we need to work on a view of the top left if the idx buffer instead idx_buffer_view = paddle.zeros([hsy, wsx, sy * sx], dtype=paddle.int64) idx_buffer_view.put_along_axis_( axis=2, indices=rand_idx, - values=-paddle.ones_like( - rand_idx, dtype=rand_idx.dtype), ) - idx_buffer_view = (idx_buffer_view.reshape([hsy, wsx, sy, sx]) - .transpose([0, 2, 1, 3]) - .reshape([hsy * sy, wsx * sx])) + values=-paddle.ones_like(rand_idx, dtype=rand_idx.dtype), + ) + idx_buffer_view = ( + idx_buffer_view.reshape([hsy, wsx, sy, sx]).transpose([0, 2, 1, 3]).reshape([hsy * sy, wsx * sx]) + ) # Image is not divisible by sx or sy so we need to move it into a new buffer if (hsy * sy) < h or (wsx * sx) < w: idx_buffer = paddle.zeros([h, w], dtype=paddle.int64) - idx_buffer[:(hsy * sy), :(wsx * sx)] = idx_buffer_view + idx_buffer[: (hsy * sy), : (wsx * sx)] = idx_buffer_view else: idx_buffer = idx_buffer_view @@ -147,10 +142,8 @@ def bipartite_soft_matching_random2d( def split(x): C = x.shape[-1] - src = x.take_along_axis( - indices=a_idx.expand([B, N - num_dst, C]), axis=1) - dst = x.take_along_axis( - indices=b_idx.expand([B, num_dst, C]), axis=1) + src = x.take_along_axis(indices=a_idx.expand([B, N - num_dst, C]), axis=1) + dst = x.take_along_axis(indices=b_idx.expand([B, num_dst, C]), axis=1) return src, dst # Cosine similarity between A and B @@ -178,12 +171,10 @@ def merge(x: paddle.Tensor, mode="mean") -> paddle.Tensor: src, dst = split(x) n, t1, c = src.shape - unm = src.take_along_axis( - indices=unm_idx.expand([n, t1 - r, c]), axis=-2) + unm = src.take_along_axis(indices=unm_idx.expand([n, t1 - r, c]), axis=-2) src = src.take_along_axis(indices=src_idx.expand([n, r, c]), axis=-2) - dst = scatter_reduce( - dst, -2, dst_idx.expand([n, r, c]), src, reduce=mode) + dst = scatter_reduce(dst, -2, dst_idx.expand([n, r, c]), src, reduce=mode) return paddle.concat([unm, dst], axis=1) @@ -200,25 +191,27 @@ def unmerge(x: paddle.Tensor) -> paddle.Tensor: out.put_along_axis_( indices=b_idx.expand([B, num_dst, c]), values=dst, - axis=-2, ) + axis=-2, + ) out.put_along_axis_( - indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis( - indices=unm_idx, axis=1).expand([B, unm_len, c]), + indices=a_idx.expand([B, a_idx.shape[1], 1]) + 
.take_along_axis(indices=unm_idx, axis=1) + .expand([B, unm_len, c]), values=unm, - axis=-2, ) + axis=-2, + ) out.put_along_axis_( - indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis( - indices=src_idx, axis=1).expand([B, r, c]), + indices=a_idx.expand([B, a_idx.shape[1], 1]).take_along_axis(indices=src_idx, axis=1).expand([B, r, c]), values=src, - axis=-2, ) + axis=-2, + ) return out return merge, unmerge -def compute_merge(x: paddle.Tensor, - tome_info: Dict[str, Any]) -> Tuple[Callable, ...]: +def compute_merge(x: paddle.Tensor, tome_info: Dict[str, Any]) -> Tuple[Callable, ...]: original_h, original_w = tome_info["size"] original_tokens = original_h * original_w downsample = int(math.ceil(math.sqrt(original_tokens // x.shape[1]))) @@ -232,8 +225,7 @@ def compute_merge(x: paddle.Tensor, # If the batch size is odd, then it's not possible for promted and unprompted images to be in the same # batch, which causes artifacts with use_rand, so force it to be off. use_rand = False if x.shape[0] % 2 == 1 else args["use_rand"] - m, u = bipartite_soft_matching_random2d(x, w, h, args["sx"], args["sy"], - r, not use_rand) + m, u = bipartite_soft_matching_random2d(x, w, h, args["sx"], args["sy"], r, not use_rand) else: m, u = (do_nothing, do_nothing) @@ -255,31 +247,27 @@ class ToMeBasicTransformerBlock(block_class): _parent = block_class def forward( - self: BasicTransformerBlock, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - timestep=None, - cross_attention_kwargs=None, - class_labels=None, ) -> paddle.Tensor: + self: BasicTransformerBlock, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ) -> paddle.Tensor: # (1) ToMe - m_a, m_c, m_m, u_a, u_c, u_m = compute_merge(hidden_states, - self._tome_info) + m_a, m_c, m_m, u_a, u_c, u_m = compute_merge(hidden_states, self._tome_info) if self.use_ada_layer_norm: norm_hidden_states = self.norm1(hidden_states, timestep) elif self.use_ada_layer_norm_zero: - ( - norm_hidden_states, - gate_msa, - shift_mlp, - scale_mlp, - gate_mlp, ) = self.norm1( - hidden_states, - timestep, - class_labels, - hidden_dtype=hidden_states.dtype, ) + (norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp,) = self.norm1( + hidden_states, + timestep, + class_labels, + hidden_dtype=hidden_states.dtype, + ) else: norm_hidden_states = self.norm1(hidden_states) @@ -287,15 +275,13 @@ def forward( norm_hidden_states = m_a(norm_hidden_states) # 1. 
Self-Attention - cross_attention_kwargs = (cross_attention_kwargs - if cross_attention_kwargs is not None else - {}) + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} attn_output = self.attn1( norm_hidden_states, - encoder_hidden_states=encoder_hidden_states - if self.only_cross_attention else None, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) if self.use_ada_layer_norm_zero: attn_output = gate_msa.unsqueeze(1) * attn_output @@ -303,9 +289,9 @@ def forward( hidden_states = u_a(attn_output) + hidden_states if self.attn2 is not None: - norm_hidden_states = (self.norm2(hidden_states, timestep) - if self.use_ada_layer_norm else - self.norm2(hidden_states)) + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) # (4) ToMe m_c norm_hidden_states = m_c(norm_hidden_states) @@ -314,7 +300,8 @@ def forward( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=encoder_attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) # (5) ToMe u_c hidden_states = u_c(attn_output) + hidden_states @@ -322,9 +309,7 @@ def forward( norm_hidden_states = self.norm3(hidden_states) if self.use_ada_layer_norm_zero: - norm_hidden_states = ( - norm_hidden_states * - (1 + scale_mlp[:, None]) + shift_mlp[:, None]) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] # (6) ToMe m_m norm_hidden_states = m_m(norm_hidden_states) @@ -353,8 +338,7 @@ def hook(module, args): @patch_to([DiffusionPipeline, nn.Layer]) -def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], - only_return_self: bool=True): +def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], only_return_self: bool = True): """Removes a patch from a ToMeXXX module if it was already patched.""" model_list = [] if isinstance(model_or_pipe, DiffusionPipeline): @@ -385,15 +369,16 @@ def remove_tome(model_or_pipe: Union[nn.Layer, DiffusionPipeline], @patch_to([DiffusionPipeline, nn.Layer]) def apply_tome( - model_or_pipe: Union[nn.Layer, DiffusionPipeline], - ratio: float=0.5, - max_downsample: int=1, - sx: int=2, - sy: int=2, - use_rand: bool=True, - merge_attn: bool=True, - merge_crossattn: bool=False, - merge_mlp: bool=False, ): + model_or_pipe: Union[nn.Layer, DiffusionPipeline], + ratio: float = 0.5, + max_downsample: int = 1, + sx: int = 2, + sy: int = 2, + use_rand: bool = True, + merge_attn: bool = True, + merge_crossattn: bool = False, + merge_mlp: bool = False, +): """ Patches a stable diffusion model_or_pipe with ToMe. Apply this to the highest level stable diffusion object (i.e., it should have a .unet). 
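Since apply_tome and remove_tome are attached to both DiffusionPipeline and nn.Layer via patch_to, they are expected to be callable directly on a loaded pipeline. A hedged usage sketch, not part of this patch; the checkpoint name is only an example and any StableDiffusion-style pipeline with a .unet should do.

# Usage sketch: enable ToMe, run inference, then restore the original blocks.
from ppdiffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.apply_tome(ratio=0.5)   # merge roughly half of the self-attention tokens
image = pipe("an astronaut riding a horse").images[0]
pipe.remove_tome()           # undo the ToMeBasicTransformerBlock patch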
diff --git a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py b/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py index 6cc870cfb75ee..89a574fe97842 100644 --- a/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py +++ b/ppdiffusers/ppdiffusers/patches/webui_lora_patch_utils.py @@ -42,7 +42,7 @@ def convert_pt_to_pd(state, dtype): if val.ndim == 2: val = val.T if val.ndim == 0: - val = val.reshape((1, )) + val = val.reshape((1,)) new_state[b] = val.cast(dtype) else: print(f"We find {a} not in state_dict and we will continue!") @@ -87,12 +87,10 @@ def save_lora(pipe_or_module, save_directory, WEIGHT_NAME=None): if is_torch_available(): save_function = safetensors.torch.save_file - outdict = convert_state_dict( - convert_pd_to_pt(outdict), framework="torch") + outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="torch") else: save_function = safetensors.numpy.save_file - outdict = convert_state_dict( - convert_pd_to_pt(outdict), framework="numpy") + outdict = convert_state_dict(convert_pd_to_pt(outdict), framework="numpy") save_function(outdict, os.path.join(save_directory, WEIGHT_NAME)) del outdict @@ -116,15 +114,16 @@ def set_lora(self): @patch_to([DiffusionPipeline, nn.Layer]) def apply_lora( - pipe_or_module, - lora_weight_or_path=None, - rank=4, - alpha=None, - multiplier=1.0, - text_encoder_target_replace_modules=["TransformerEncoderLayer"], - unet_target_replace_modules=["Transformer2DModel", "Attention"], - enable_lora=True, - **kwargs, ): + pipe_or_module, + lora_weight_or_path=None, + rank=4, + alpha=None, + multiplier=1.0, + text_encoder_target_replace_modules=["TransformerEncoderLayer"], + unet_target_replace_modules=["Transformer2DModel", "Attention"], + enable_lora=True, + **kwargs, +): resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) paddle_dtype = kwargs.pop("paddle_dtype", None) @@ -143,17 +142,16 @@ def apply_lora( lora_weight_or_path = str(lora_weight_or_path) if os.path.isfile(lora_weight_or_path): lora_weight_or_path = lora_weight_or_path - elif lora_weight_or_path.startswith( - "http://") or lora_weight_or_path.startswith("https://"): + elif lora_weight_or_path.startswith("http://") or lora_weight_or_path.startswith("https://"): lora_weight_or_path = ppdiffusers_url_download( lora_weight_or_path, cache_dir=cache_dir, resume_download=resume_download, - force_download=force_download, ) + force_download=force_download, + ) else: raise EnvironmentError(f"Please check your {lora_weight_or_path}.") - lora_weight_or_path = convert_pt_to_pd( - smart_load(lora_weight_or_path), paddle_dtype) + lora_weight_or_path = convert_pt_to_pd(smart_load(lora_weight_or_path), paddle_dtype) mayberanklist = [] maybealphalist = [] @@ -176,67 +174,64 @@ def apply_lora( if len(mayberanklist) > 20: break if len(set(mayberanklist)) > 1: - print( - f"Can't guess rank! Here are the rank list {mayberanklist}. We will use default rank {rank}." - ) + print(f"Can't guess rank! Here are the rank list {mayberanklist}. We will use default rank {rank}.") else: rank = mayberanklist[0] print(f"|---------------Currently, rank is {rank}!") if len(set(maybealphalist)) > 1: - print( - f"Can't guess alpha! Here are the rank list {maybealphalist}. We will use default alpha {alpha}" - ) + print(f"Can't guess alpha! Here are the rank list {maybealphalist}. 
We will use default alpha {alpha}") else: alpha = maybealphalist[0] print(f"|---------------Currently, alpha is {alpha}!") waitlist = [] if isinstance(pipe_or_module, nn.Layer): - waitlist.append(( - pipe_or_module, - text_encoder_target_replace_modules + unet_target_replace_modules, - )) + waitlist.append( + ( + pipe_or_module, + text_encoder_target_replace_modules + unet_target_replace_modules, + ) + ) else: if hasattr(pipe_or_module, "text_encoder"): - waitlist.append((pipe_or_module.text_encoder, - text_encoder_target_replace_modules)) + waitlist.append((pipe_or_module.text_encoder, text_encoder_target_replace_modules)) if hasattr(pipe_or_module, "unet"): waitlist.append((pipe_or_module.unet, unet_target_replace_modules)) lora_modules = {} for each_module, target_replace_modules in waitlist: for name1, module in each_module.named_sublayers(include_self=True): if module.__class__.__name__ in target_replace_modules: - for name2, child_module in module.named_sublayers( - include_self=True): + for name2, child_module in module.named_sublayers(include_self=True): if not getattr(child_module, "is_lora_linear", False) and ( - child_module.__class__.__name__ == "Linear" or - (child_module.__class__.__name__ == "Conv2D" and - list(child_module._kernel_size) == [1, 1])): + child_module.__class__.__name__ == "Linear" + or (child_module.__class__.__name__ == "Conv2D" and list(child_module._kernel_size) == [1, 1]) + ): # if we apply lora multi - if hasattr(child_module, - "merged") and child_module.merged: + if hasattr(child_module, "merged") and child_module.merged: with paddle.no_grad(): if child_module.is_conv: new_weight = ( - child_module.weight.squeeze([-1, -2]) - - child_module.lora_up.weight.squeeze( - [-1, -2]) - @child_module.lora_down.weight.squeeze( - [-1, -2]) * child_module.multiplier - * child_module.scale).unsqueeze( - [-1, -2]) + child_module.weight.squeeze([-1, -2]) + - child_module.lora_up.weight.squeeze([-1, -2]) + @ child_module.lora_down.weight.squeeze([-1, -2]) + * child_module.multiplier + * child_module.scale + ).unsqueeze([-1, -2]) else: - new_weight = (child_module.weight - - child_module.lora_down.weight - @child_module.lora_up.weight * - child_module.multiplier * - child_module.scale) + new_weight = ( + child_module.weight + - child_module.lora_down.weight + @ child_module.lora_up.weight + * child_module.multiplier + * child_module.scale + ) child_module.weight.set_value(new_weight) in_features, out_features = ( child_module.weight.shape[0], - child_module.weight.shape[1], ) + child_module.weight.shape[1], + ) child_module.is_conv = False child_module.merged = False @@ -250,15 +245,11 @@ def apply_lora( ) if child_module.is_conv: - child_module.lora_down = nn.Conv2D( - in_features, rank, [1, 1], bias_attr=False) - child_module.lora_up = nn.Conv2D( - rank, out_features, [1, 1], bias_attr=False) + child_module.lora_down = nn.Conv2D(in_features, rank, [1, 1], bias_attr=False) + child_module.lora_up = nn.Conv2D(rank, out_features, [1, 1], bias_attr=False) else: - child_module.lora_down = nn.Linear( - in_features, rank, bias_attr=False) - child_module.lora_up = nn.Linear( - rank, out_features, bias_attr=False) + child_module.lora_down = nn.Linear(in_features, rank, bias_attr=False) + child_module.lora_up = nn.Linear(rank, out_features, bias_attr=False) child_module.lora_down.is_lora_linear = True child_module.lora_up.is_lora_linear = True child_module.rank = rank @@ -268,13 +259,10 @@ def apply_lora( alpha = alpha.detach().cast("float32").numpy() alpha = rank if alpha is 
None or alpha == 0 else alpha child_module.scale = alpha / child_module.rank - child_module.register_buffer( - "alpha", paddle.to_tensor( - alpha, dtype="float32")) + child_module.register_buffer("alpha", paddle.to_tensor(alpha, dtype="float32")) # same as microsoft's - kaiming_uniform_( - child_module.lora_down.weight, a=math.sqrt(5)) + kaiming_uniform_(child_module.lora_down.weight, a=math.sqrt(5)) zeros_(child_module.lora_up.weight) child_module.multiplier = multiplier @@ -287,44 +275,47 @@ def forward_lora(self, x): with paddle.no_grad(): if self.is_conv: new_weight = ( - self.weight.squeeze([-1, -2]) - - self.lora_up.weight.squeeze( - [-1, -2]) - @self.lora_down.weight.squeeze( - [-1, -2]) * self.multiplier - * self.scale).unsqueeze( - [-1, -2]) + self.weight.squeeze([-1, -2]) + - self.lora_up.weight.squeeze([-1, -2]) + @ self.lora_down.weight.squeeze([-1, -2]) + * self.multiplier + * self.scale + ).unsqueeze([-1, -2]) else: new_weight = ( - self.weight - - self.lora_down.weight - @self.lora_up.weight * - self.multiplier * self.scale) + self.weight + - self.lora_down.weight + @ self.lora_up.weight + * self.multiplier + * self.scale + ) self.weight.set_value(new_weight) self.merged = False if not self.enable_lora: return self.raw_forward(x) - return (self.raw_forward(x) + - self.lora_up(self.lora_down(x)) * - self.multiplier * self.scale) + return ( + self.raw_forward(x) + + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale + ) else: if self.enable_lora and not self.merged: with paddle.no_grad(): if self.is_conv: new_weight = ( - self.weight.squeeze([-1, -2]) + - self.lora_up.weight.squeeze( - [-1, -2]) - @self.lora_down.weight.squeeze( - [-1, -2]) * self.multiplier - * self.scale).unsqueeze( - [-1, -2]) + self.weight.squeeze([-1, -2]) + + self.lora_up.weight.squeeze([-1, -2]) + @ self.lora_down.weight.squeeze([-1, -2]) + * self.multiplier + * self.scale + ).unsqueeze([-1, -2]) else: new_weight = ( - self.weight + - self.lora_down.weight - @self.lora_up.weight * - self.multiplier * self.scale) + self.weight + + self.lora_down.weight + @ self.lora_up.weight + * self.multiplier + * self.scale + ) self.weight.set_value(new_weight) self.merged = True @@ -332,25 +323,25 @@ def forward_lora(self, x): with paddle.no_grad(): if self.is_conv: new_weight = ( - self.weight.squeeze([-1, -2]) - - self.lora_up.weight.squeeze( - [-1, -2]) - @self.lora_down.weight.squeeze( - [-1, -2]) * self.multiplier - * self.scale).unsqueeze( - [-1, -2]) + self.weight.squeeze([-1, -2]) + - self.lora_up.weight.squeeze([-1, -2]) + @ self.lora_down.weight.squeeze([-1, -2]) + * self.multiplier + * self.scale + ).unsqueeze([-1, -2]) else: new_weight = ( - self.weight - - self.lora_down.weight - @self.lora_up.weight * - self.multiplier * self.scale) + self.weight + - self.lora_down.weight + @ self.lora_up.weight + * self.multiplier + * self.scale + ) self.weight.set_value(new_weight) self.merged = False return self.raw_forward(x) - child_module.forward = MethodType(forward_lora, - child_module) + child_module.forward = MethodType(forward_lora, child_module) child_module.lora_down.training = child_module.training child_module.lora_up.training = child_module.training child_module.to(dtype=paddle_dtype) diff --git a/ppdiffusers/ppdiffusers/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipeline_utils.py index 4ddfca40ac392..48a455def8412 100644 --- a/ppdiffusers/ppdiffusers/pipeline_utils.py +++ b/ppdiffusers/ppdiffusers/pipeline_utils.py @@ -18,4 +18,4 @@ # It only exists so that temporarely `from 
diffusers.pipelines import DiffusionPipeline` works from .pipelines import ImagePipelineOutput # noqa: F401 -from .pipelines import DiffusionPipeline, TextPipelineOutput +from .pipelines import DiffusionPipeline, TextPipelineOutput # noqa: F401 diff --git a/ppdiffusers/ppdiffusers/pipelines/__init__.py b/ppdiffusers/ppdiffusers/pipelines/__init__.py index 3c7b73e5fcf47..db10dd5dccfe7 100644 --- a/ppdiffusers/ppdiffusers/pipelines/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/__init__.py @@ -13,10 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..utils import (OptionalDependencyNotAvailable, is_einops_available, - is_fastdeploy_available, is_k_diffusion_available, - is_librosa_available, is_note_seq_available, - is_paddle_available, is_paddlenlp_available) +from ..utils import ( + OptionalDependencyNotAvailable, + is_einops_available, + is_fastdeploy_available, + is_k_diffusion_available, + is_librosa_available, + is_note_seq_available, + is_paddle_available, + is_paddlenlp_available, +) try: if not is_paddle_available(): @@ -30,8 +36,12 @@ from .dit import DiTPipeline from .latent_diffusion import LDMSuperResolutionPipeline from .latent_diffusion_uncond import LDMPipeline - from .pipeline_utils import (AudioPipelineOutput, DiffusionPipeline, - ImagePipelineOutput, TextPipelineOutput) + from .pipeline_utils import ( + AudioPipelineOutput, + DiffusionPipeline, + ImagePipelineOutput, + TextPipelineOutput, + ) from .pndm import PNDMPipeline from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline @@ -51,38 +61,52 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_objects import * # noqa F403 else: - from .alt_diffusion import (AltDiffusionImg2ImgPipeline, - AltDiffusionPipeline) + from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline from .audioldm import AudioLDMPipeline from .deepfloyd_if import ( - IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, - IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, IFPipeline, - IFSuperResolutionPipeline) + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ) from .latent_diffusion import LDMTextToImagePipeline from .lvdm import LVDMTextToVideoPipeline, LVDMUncondPipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline from .stable_diffusion import ( - CycleDiffusionPipeline, StableDiffusionAdapterPipeline, + CycleDiffusionPipeline, + StableDiffusionAdapterPipeline, StableDiffusionAttendAndExcitePipeline, - StableDiffusionControlNetPipeline, StableDiffusionDepth2ImgPipeline, - StableDiffusionImageVariationPipeline, StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy, + StableDiffusionControlNetPipeline, + StableDiffusionDepth2ImgPipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, - StableDiffusionLatentUpscalePipeline, StableDiffusionMegaPipeline, - StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, - StableDiffusionPipeline, StableDiffusionPipelineAllinOne, - StableDiffusionPix2PixZeroPipeline, StableDiffusionSAGPipeline, - StableDiffusionUpscalePipeline, StableUnCLIPImg2ImgPipeline, - 
StableUnCLIPPipeline) + StableDiffusionLatentUpscalePipeline, + StableDiffusionMegaPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionPipeline, + StableDiffusionPipelineAllinOne, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, + StableDiffusionUpscalePipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) from .stable_diffusion_safe import StableDiffusionPipelineSafe - from .text_to_video_synthesis import (TextToVideoSDPipeline, - TextToVideoZeroPipeline) + from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline - from .versatile_diffusion import (VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, - VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline) + from .versatile_diffusion import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) from .vq_diffusion import VQDiffusionPipeline try: @@ -91,12 +115,13 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_fastdeploy_objects import * # noqa F403 else: - from .fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) + from .fastdeploy_utils import ( + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, + ) try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_fastdeploy_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_fastdeploy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 @@ -110,11 +135,11 @@ FastDeployStableDiffusionInpaintPipelineLegacy, FastDeployStableDiffusionMegaPipeline, FastDeployStableDiffusionPipeline, - FastDeployStableDiffusionUpscalePipeline) + FastDeployStableDiffusionUpscalePipeline, + ) try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_k_diffusion_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_k_diffusion_objects import * # noqa F403 @@ -122,8 +147,7 @@ from .stable_diffusion import StableDiffusionKDiffusionPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_einops_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_einops_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_einops_objects import * # noqa F403 @@ -131,11 +155,9 @@ from .unidiffuser import UniDiffuserPipeline try: - if not (is_paddle_available() and is_paddlenlp_available() and - is_note_seq_available()): + if not (is_paddle_available() and is_paddlenlp_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * # noqa F403 else: - from .spectrogram_diffusion import (MidiProcessor, - SpectrogramDiffusionPipeline) + from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py 
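The restructured imports in pipelines/__init__.py keep the existing guard pattern: probe for each optional backend, raise OptionalDependencyNotAvailable when one is missing, and fall back to dummy placeholder objects instead of the real pipelines. A generic sketch of that pattern (hypothetical helper and package names, not the ppdiffusers internals):

import importlib.util


class OptionalDependencyNotAvailable(ImportError):
    """Raised when an optional backend needed by a pipeline is missing."""


def is_available(package: str) -> bool:
    # True if `package` can be imported in the current environment
    return importlib.util.find_spec(package) is not None


try:
    if not (is_available("paddle") and is_available("einops")):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    print("backend missing: dummy objects would be exported here")
else:
    print("backends found: the real pipeline classes would be imported here")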
b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py index 087da16f84c37..70cd40778b488 100644 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py +++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/modeling_roberta_series.py @@ -24,9 +24,7 @@ from paddlenlp.transformers.model_outputs import ModelOutput -def create_position_ids_from_input_ids(input_ids, - padding_idx, - past_key_values_length=0): +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols are ignored. This is modified from fairseq's `utils.make_positions`. @@ -38,8 +36,7 @@ def create_position_ids_from_input_ids(input_ids, """ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. mask = (input_ids != padding_idx).cast("int64") - incremental_indices = (paddle.cumsum( - mask, axis=1) + past_key_values_length) * mask + incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask return incremental_indices + padding_idx @@ -76,21 +73,23 @@ class RobertaSeriesConfig(XLMRobertaConfig): model_type = "roberta" def __init__( - self, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - project_dim=512, - pooler_fn="cls", - learn_encoder=False, - use_attention_mask=True, - **kwargs, ): + self, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + project_dim=512, + pooler_fn="cls", + learn_encoder=False, + use_attention_mask=True, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, - **kwargs, ) + **kwargs, + ) self.project_dim = project_dim self.pooler_fn = pooler_fn self.learn_encoder = learn_encoder @@ -99,9 +98,7 @@ def __init__( class RobertaSeriesModelWithTransformation(RobertaPretrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"] - _keys_to_ignore_on_load_missing = [ - r"position_ids", r"predictions.decoder.bias" - ] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] base_model_prefix = "roberta" config_class = RobertaSeriesConfig @@ -111,39 +108,35 @@ def __init__(self, config: RobertaSeriesConfig): # must reset _padding_idx self.roberta.embeddings.word_embeddings._padding_idx = None self.transformation = nn.Linear(config.hidden_size, config.project_dim) - self.has_pre_transformation = getattr(config, "has_pre_transformation", - False) + self.has_pre_transformation = getattr(config, "has_pre_transformation", False) if self.has_pre_transformation: - self.transformation_pre = nn.Linear(config.hidden_size, - config.project_dim) - self.pre_LN = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_eps) + self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim) + self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.init_weights() def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - token_type_ids: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - return_dict: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, ): - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: 
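create_position_ids_from_input_ids above numbers only the non-padding tokens, starting at padding_idx + 1, by cumulative-summing a padding mask. A small NumPy sketch of the same trick (padding_idx = 1 assumed, matching the Roberta config in this file; the ids themselves are made up):

import numpy as np

padding_idx = 1
input_ids = np.array([[0, 11, 12, 13, 1, 1]])          # trailing 1s are padding

mask = (input_ids != padding_idx).astype(np.int64)      # [[1, 1, 1, 1, 0, 0]]
incremental = np.cumsum(mask, axis=1) * mask            # [[1, 2, 3, 4, 0, 0]]
position_ids = incremental + padding_idx                # [[2, 3, 4, 5, 1, 1]]

print(position_ids)  # pads stay at padding_idx, real tokens count up from padding_idx + 1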
Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if position_ids is None: - position_ids = create_position_ids_from_input_ids( - input_ids, self.config.pad_token_id) + position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id) outputs = self.base_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, output_attentions=output_attentions, - output_hidden_states=True - if self.has_pre_transformation else output_hidden_states, - return_dict=return_dict, ) + output_hidden_states=True if self.has_pre_transformation else output_hidden_states, + return_dict=return_dict, + ) if self.has_pre_transformation: sequence_output2 = outputs["hidden_states"][-2] @@ -154,11 +147,13 @@ def forward( projection_state=projection_state2, last_hidden_state=outputs.last_hidden_state, hidden_states=outputs.hidden_states, - attentions=outputs.attentions, ) + attentions=outputs.attentions, + ) else: projection_state = self.transformation(outputs.last_hidden_state) return TransformationModelOutput( projection_state=projection_state, last_hidden_state=outputs.last_hidden_state, hidden_states=outputs.hidden_states, - attentions=outputs.attentions, ) + attentions=outputs.attentions, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index a610e38dbd5ac..0dee82d33981b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -24,8 +24,7 @@ from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from . 
import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation @@ -85,37 +84,33 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: RobertaSeriesModelWithTransformation, - tokenizer: XLMRobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: RobertaSeriesModelWithTransformation, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -123,11 +118,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -148,12 +139,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -164,12 +153,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -181,18 +167,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
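The __init__ hunk keeps the rule that the VAE halves the spatial resolution at every block transition, so vae_scale_factor = 2 ** (len(block_out_channels) - 1). With a typical four-level VAE config (an assumption, not read from this diff), a 512x512 image maps to 64x64 latents:

block_out_channels = [128, 256, 512, 512]                # assumed 4-level VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)    # 2**3 == 8

height = width = 512
latent_h, latent_w = height // vae_scale_factor, width // vae_scale_factor
print(vae_scale_factor, latent_h, latent_w)              # 8 64 64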
@@ -232,29 +220,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because XLM-Roberta can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -262,8 +252,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -273,21 +262,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
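The truncation warning in _encode_prompt compares the length-capped encoding against a padding="longest" (uncapped) encoding and decodes whatever fell off the end. A toy sketch of that comparison with plain Python lists (no real tokenizer; the ids and the max length are made up):

model_max_length = 8
untruncated_ids = [101, 5, 6, 7, 8, 9, 10, 11, 12, 13, 102]        # full encoding, bos ... eos
text_input_ids = untruncated_ids[: model_max_length - 1] + [102]   # capped encoding, eos re-appended

if len(untruncated_ids) >= len(text_input_ids) and untruncated_ids != text_input_ids:
    # everything between the last kept position and the final special token was dropped
    removed = untruncated_ids[model_max_length - 1 : -1]
    print("truncated ids:", removed)   # [11, 12, 13]; the pipeline batch_decode()s these for the warning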
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -295,46 +285,42 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -353,53 +339,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -412,22 +394,25 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -446,25 +431,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -545,7 +530,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -567,7 +553,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -582,43 +569,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -631,8 +613,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -641,11 +622,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
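The denoising loop keeps the standard classifier-free-guidance trick: run the UNet once on a doubled batch of latents, split the prediction, and extrapolate from the unconditional half toward the text-conditioned half. A NumPy sketch of the blending step only (guidance_scale 7.5 as in the defaults above; fake_unet is a stand-in, so unlike a real conditioned UNet its two halves come out identical):

import numpy as np

guidance_scale = 7.5
latents = np.random.randn(1, 4, 64, 64).astype("float32")

def fake_unet(x):                         # stand-in for self.unet(...).sample
    return 0.1 * x

latent_model_input = np.concatenate([latents] * 2)      # [uncond, text] stacked into one batch
noise_pred = fake_unet(latent_model_input)

noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.shape)                   # (1, 4, 64, 64), then handed to scheduler.step(...)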
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return AltDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 313c4e5e2eca1..232d79d8da99a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -27,8 +27,13 @@ from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + PIL_INTERPOLATION, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation @@ -74,11 +79,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -90,8 +91,7 @@ def preprocess(image): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionImg2ImgPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-guided image to image generation using Alt Diffusion. @@ -128,37 +128,33 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: RobertaSeriesModelWithTransformation, - tokenizer: XLMRobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: RobertaSeriesModelWithTransformation, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. 
`steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -166,11 +162,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -191,12 +183,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -207,12 +197,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -224,21 +211,23 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.register_to_config( - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -278,29 +267,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because XLM-Roberta can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -308,8 +299,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, 
-1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -319,21 +309,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -341,36 +332,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -379,17 +367,14 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=paddle.cast(safety_checker_input.pixel_values, - dtype), ) + clip_input=paddle.cast(safety_checker_input.pixel_values, dtype), + ) return image, has_nsfw_concept def decode_latents(self, latents): @@ -404,51 +389,48 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -461,25 +443,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -496,8 +472,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -505,8 +480,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -518,12 +492,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
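get_timesteps above turns the img2img strength into how much of the noise schedule is actually run: with 50 inference steps and strength 0.8, only the last 40 timesteps are denoised. A quick arithmetic check (first-order scheduler assumed, i.e. scheduler.order == 1):

num_inference_steps, strength, order = 50, 0.8, 1

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)   # 40
t_start = max(num_inference_steps - init_timestep, 0)                           # 10

timesteps = list(range(num_inference_steps))       # stand-in for scheduler.timesteps
kept = timesteps[t_start * order:]
print(init_timestep, t_start, len(kept))           # 40 10 40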
) @@ -542,24 +515,24 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -636,7 +609,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -657,17 +631,16 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image image = self.image_processor.preprocess(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - (batch_size * num_images_per_prompt, )) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile((batch_size * num_images_per_prompt,)) # 6. Prepare latent variables latents = self.prepare_latents( @@ -676,51 +649,45 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) else: image = latents has_nsfw_concept = None @@ -730,11 +697,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return AltDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py index ffe2c5bad7456..ca098c706711c 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py +++ b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/mel.py @@ -25,7 +25,9 @@ _import_error = "" except Exception as e: _librosa_can_be_imported = False - _import_error = f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it." + _import_error = ( + f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it." 
+ ) from PIL import Image # noqa: E402 @@ -46,14 +48,15 @@ class Mel(ConfigMixin, SchedulerMixin): @register_to_config def __init__( - self, - x_res: int=256, - y_res: int=256, - sample_rate: int=22050, - n_fft: int=2048, - hop_length: int=512, - top_db: int=80, - n_iter: int=32, ): + self, + x_res: int = 256, + y_res: int = 256, + sample_rate: int = 22050, + n_fft: int = 2048, + hop_length: int = 512, + top_db: int = 80, + n_iter: int = 32, + ): self.hop_length = hop_length self.sr = sample_rate self.n_fft = n_fft @@ -77,7 +80,7 @@ def set_resolution(self, x_res: int, y_res: int): self.n_mels = self.y_res self.slice_size = self.x_res * self.hop_length - 1 - def load_audio(self, audio_file: str=None, raw_audio: np.ndarray=None): + def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None): """Load audio. Args: @@ -91,10 +94,12 @@ def load_audio(self, audio_file: str=None, raw_audio: np.ndarray=None): # Pad with silence if necessary. if len(self.audio) < self.x_res * self.hop_length: - self.audio = np.concatenate([ - self.audio, - np.zeros((self.x_res * self.hop_length - len(self.audio), )), - ]) + self.audio = np.concatenate( + [ + self.audio, + np.zeros((self.x_res * self.hop_length - len(self.audio),)), + ] + ) def get_number_of_slices(self) -> int: """Get number of slices in audio. @@ -104,7 +109,7 @@ def get_number_of_slices(self) -> int: """ return len(self.audio) // self.slice_size - def get_audio_slice(self, slice: int=0) -> np.ndarray: + def get_audio_slice(self, slice: int = 0) -> np.ndarray: """Get slice of audio. Args: @@ -113,7 +118,7 @@ def get_audio_slice(self, slice: int=0) -> np.ndarray: Returns: `np.ndarray`: audio as numpy array """ - return self.audio[self.slice_size * slice:self.slice_size * (slice + 1)] + return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)] def get_sample_rate(self) -> int: """Get sample rate: @@ -137,11 +142,10 @@ def audio_slice_to_image(self, slice: int) -> Image.Image: sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, - n_mels=self.n_mels, ) + n_mels=self.n_mels, + ) log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db) - bytedata = (( - (log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5 - ).astype(np.uint8) + bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8) image = Image.fromarray(bytedata) return image @@ -154,8 +158,7 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray: Returns: audio (`np.ndarray`): raw audio """ - bytedata = np.frombuffer( - image.tobytes(), dtype="uint8").reshape((image.height, image.width)) + bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width)) log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db S = librosa.db_to_power(log_S) audio = librosa.feature.inverse.mel_to_audio( @@ -163,5 +166,6 @@ def image_to_audio(self, image: Image.Image) -> np.ndarray: sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, - n_iter=self.n_iter, ) + n_iter=self.n_iter, + ) return audio diff --git a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py index 50b57cd936dac..581729f066b72 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py @@ -23,8 +23,12 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import 
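With the Mel defaults shown above (x_res 256, hop_length 512, sample_rate 22050), each spectrogram image covers x_res * hop_length - 1 audio samples, just under six seconds, and get_number_of_slices is an integer division by that slice size. The arithmetic, with a hypothetical 10-second clip:

x_res, hop_length, sample_rate = 256, 512, 22050

slice_size = x_res * hop_length - 1            # 131071 samples per image
seconds_per_slice = slice_size / sample_rate   # about 5.94 s

audio_len = 10 * sample_rate                   # hypothetical 10 s clip
num_slices = audio_len // slice_size           # 1 full slice fits
print(slice_size, round(seconds_per_slice, 2), num_slices)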
DDIMScheduler, DDPMScheduler from ...utils import randn_tensor -from ..pipeline_utils import (AudioPipelineOutput, BaseOutput, - DiffusionPipeline, ImagePipelineOutput) +from ..pipeline_utils import ( + AudioPipelineOutput, + BaseOutput, + DiffusionPipeline, + ImagePipelineOutput, +) from .mel import Mel @@ -43,14 +47,14 @@ class AudioDiffusionPipeline(DiffusionPipeline): _optional_components = ["vqvae"] def __init__( - self, - vqvae: AutoencoderKL, - unet: UNet2DConditionModel, - mel: Mel, - scheduler: Union[DDIMScheduler, DDPMScheduler], ): + self, + vqvae: AutoencoderKL, + unet: UNet2DConditionModel, + mel: Mel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + ): super().__init__() - self.register_modules( - unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae) + self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae) def get_input_dims(self) -> Tuple: """Returns dimension of input image @@ -62,8 +66,9 @@ def get_input_dims(self) -> Tuple: # For backwards compatibility sample_size = ( (input_module.config.sample_size, input_module.config.sample_size) - if type(input_module.config.sample_size) == int else - input_module.config.sample_size) + if type(input_module.config.sample_size) == int + else input_module.config.sample_size + ) return sample_size def get_default_steps(self) -> int: @@ -76,23 +81,25 @@ def get_default_steps(self) -> int: @paddle.no_grad() def __call__( - self, - batch_size: int=1, - audio_file: str=None, - raw_audio: np.ndarray=None, - slice: int=0, - start_step: int=0, - steps: int=None, - generator: paddle.Generator=None, - mask_start_secs: float=0, - mask_end_secs: float=0, - step_generator: paddle.Generator=None, - eta: float=0, - noise: paddle.Tensor=None, - encoding: paddle.Tensor=None, - return_dict=True, ) -> Union[Union[ - AudioPipelineOutput, ImagePipelineOutput], Tuple[List[ - Image.Image], Tuple[int, List[np.ndarray]]], ]: + self, + batch_size: int = 1, + audio_file: str = None, + raw_audio: np.ndarray = None, + slice: int = 0, + start_step: int = 0, + steps: int = None, + generator: paddle.Generator = None, + mask_start_secs: float = 0, + mask_end_secs: float = 0, + step_generator: paddle.Generator = None, + eta: float = 0, + noise: paddle.Tensor = None, + encoding: paddle.Tensor = None, + return_dict=True, + ) -> Union[ + Union[AudioPipelineOutput, ImagePipelineOutput], + Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]], + ]: """Generate random mel spectrogram from audio input and convert to audio. 
Args: @@ -122,7 +129,8 @@ def __call__( if type(self.unet.config.sample_size) == int: self.unet.config.sample_size = ( self.unet.config.sample_size, - self.unet.config.sample_size, ) + self.unet.config.sample_size, + ) input_dims = self.get_input_dims() self.mel.set_resolution(x_res=input_dims[1], y_res=input_dims[0]) if noise is None: @@ -131,44 +139,43 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size[0], - self.unet.config.sample_size[1], ), - generator=generator, ) + self.unet.config.sample_size[1], + ), + generator=generator, + ) images = noise mask = None if audio_file is not None or raw_audio is not None: self.mel.load_audio(audio_file, raw_audio) input_image = self.mel.audio_slice_to_image(slice) - input_image = np.frombuffer( - input_image.tobytes(), dtype="uint8").reshape( - (input_image.height, input_image.width)) + input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape( + (input_image.height, input_image.width) + ) input_image = (input_image / 255) * 2 - 1 - input_images = paddle.to_tensor( - input_image[np.newaxis, :, :], dtype=paddle.float32) + input_images = paddle.to_tensor(input_image[np.newaxis, :, :], dtype=paddle.float32) if self.vqvae is not None: - input_images = self.vqvae.encode( - paddle.unsqueeze(input_images, 0)).latent_dist.sample( - generator=generator)[0] + input_images = self.vqvae.encode(paddle.unsqueeze(input_images, 0)).latent_dist.sample( + generator=generator + )[0] input_images = self.vqvae.config.scaling_factor * input_images if start_step > 0: - images[0, 0] = self.scheduler.add_noise( - input_images, noise, - self.scheduler.timesteps[start_step - 1]) + images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1]) - pixels_per_second = (self.unet.config.sample_size[1] * - self.mel.get_sample_rate() / self.mel.x_res / - self.mel.hop_length) + pixels_per_second = ( + self.unet.config.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length + ) mask_start = int(mask_start_secs * pixels_per_second) mask_end = int(mask_end_secs * pixels_per_second) mask = self.scheduler.add_noise( input_images, noise, - paddle.to_tensor(self.scheduler.timesteps[start_step:]), ) + paddle.to_tensor(self.scheduler.timesteps[start_step:]), + ) - for step, t in enumerate( - self.progress_bar(self.scheduler.timesteps[start_step:])): + for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])): if isinstance(self.unet, UNet2DConditionModel): model_output = self.unet(images, t, encoding)["sample"] else: @@ -180,13 +187,15 @@ def __call__( timestep=t, sample=images, eta=eta, - generator=step_generator, )["prev_sample"] + generator=step_generator, + )["prev_sample"] else: images = self.scheduler.step( model_output=model_output, timestep=t, sample=images, - generator=step_generator, )["prev_sample"] + generator=step_generator, + )["prev_sample"] if mask is not None: if mask_start > 0: @@ -202,20 +211,20 @@ def __call__( images = (images / 2 + 0.5).clip(0, 1) images = images.transpose([0, 2, 3, 1]).cast("float32").numpy() images = (images * 255).round().astype("uint8") - images = list((Image.fromarray(_[:, :, 0]) for _ in images) - if images.shape[3] == 1 else (Image.fromarray( - _, mode="RGB").convert("L") for _ in images)) + images = list( + (Image.fromarray(_[:, :, 0]) for _ in images) + if images.shape[3] == 1 + else (Image.fromarray(_, mode="RGB").convert("L") for _ in images) + ) audios = [self.mel.image_to_audio(_) for _ in images] 
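The Mel helper reformatted above stores each log-mel spectrogram slice as an 8-bit grayscale image: audio_slice_to_image maps dB values in [-top_db, 0] onto [0, 255], and image_to_audio inverts that mapping before handing the spectrogram back to librosa.feature.inverse.mel_to_audio. A minimal standalone sketch of just that byte mapping, using NumPy only and illustrative values (a sketch, not part of the diff):

import numpy as np

top_db = 80  # the Mel default above (top_db=80)

def db_to_bytes(log_S: np.ndarray) -> np.ndarray:
    # forward mapping used by audio_slice_to_image: [-top_db, 0] dB -> uint8 [0, 255]
    return (((log_S + top_db) * 255 / top_db).clip(0, 255) + 0.5).astype(np.uint8)

def bytes_to_db(bytedata: np.ndarray) -> np.ndarray:
    # inverse mapping used by image_to_audio; the round trip loses at most ~0.16 dB per bin
    return bytedata.astype("float") * top_db / 255 - top_db

log_S = np.array([[-80.0, -40.0, -3.0, 0.0]])  # illustrative dB values
assert np.allclose(bytes_to_db(db_to_bytes(log_S)), log_S, atol=top_db / 255)

Storing spectrograms as uint8 images is what lets the pipeline reuse ordinary image-space diffusion models on audio.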
if not return_dict: return images, (self.mel.get_sample_rate(), audios) - return BaseOutput( - **AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), - **ImagePipelineOutput(images)) + return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images)) @paddle.no_grad() - def encode(self, images: List[Image.Image], steps: int=50) -> np.ndarray: + def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray: """Reverse step process: recover noisy image from generated image. Args: @@ -229,36 +238,30 @@ def encode(self, images: List[Image.Image], steps: int=50) -> np.ndarray: # Only works with DDIM as this method is deterministic assert isinstance(self.scheduler, DDIMScheduler) self.scheduler.set_timesteps(steps) - sample = np.array([ - np.frombuffer( - image.tobytes(), dtype="uint8").reshape( - (1, image.height, image.width)) for image in images - ]) + sample = np.array( + [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images] + ) sample = (sample / 255) * 2 - 1 sample = paddle.to_tensor(sample) - for t in self.progress_bar( - paddle.flip(self.scheduler.timesteps, (0, ))): - prev_timestep = (t - self.scheduler.num_train_timesteps // - self.scheduler.num_inference_steps) + for t in self.progress_bar(paddle.flip(self.scheduler.timesteps, (0,))): + prev_timestep = t - self.scheduler.num_train_timesteps // self.scheduler.num_inference_steps alpha_prod_t = self.scheduler.alphas_cumprod[t] - alpha_prod_t_prev = (self.scheduler.alphas_cumprod[prev_timestep] - if prev_timestep >= 0 else - self.scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + self.scheduler.alphas_cumprod[prev_timestep] + if prev_timestep >= 0 + else self.scheduler.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t model_output = self.unet(sample, t)["sample"] - pred_sample_direction = (1 - alpha_prod_t_prev)**( - 0.5) * model_output - sample = (sample - pred_sample_direction) * alpha_prod_t_prev**( - -0.5) - sample = (sample * alpha_prod_t**(0.5) + beta_prod_t** - (0.5) * model_output) + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output + sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5) + sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output return sample @staticmethod - def slerp(x0: paddle.Tensor, x1: paddle.Tensor, - alpha: float) -> paddle.Tensor: + def slerp(x0: paddle.Tensor, x1: paddle.Tensor, alpha: float) -> paddle.Tensor: """Spherical Linear intERPolation Args: @@ -270,8 +273,5 @@ def slerp(x0: paddle.Tensor, x1: paddle.Tensor, `paddle.Tensor`: interpolated tensor """ - theta = acos( - paddle.dot(paddle.flatten(x0), paddle.flatten(x1)) / - paddle.norm(x0) / paddle.norm(x1)) - return sin((1 - alpha) * theta) * x0 / sin(theta) + sin( - alpha * theta) * x1 / sin(theta) + theta = acos(paddle.dot(paddle.flatten(x0), paddle.flatten(x1)) / paddle.norm(x0) / paddle.norm(x1)) + return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta) diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py index 87a892da4d792..4ab25efc20003 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/audioldm/__init__.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
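For reference, the slerp helper reformatted at the end of pipeline_audio_diffusion.py above interpolates along the arc between two flattened tensors: with theta = arccos(<x0, x1> / (||x0|| * ||x1||)), the result is sin((1 - alpha) * theta) / sin(theta) * x0 + sin(alpha * theta) / sin(theta) * x1. A small NumPy check of that formula on illustrative vectors (a sketch, not part of the diff):

import numpy as np

def slerp(x0: np.ndarray, x1: np.ndarray, alpha: float) -> np.ndarray:
    # same formula as AudioDiffusionPipeline.slerp; assumes x0 and x1 are not (anti)parallel
    theta = np.arccos(np.dot(x0.ravel(), x1.ravel()) / (np.linalg.norm(x0) * np.linalg.norm(x1)))
    return np.sin((1 - alpha) * theta) * x0 / np.sin(theta) + np.sin(alpha * theta) * x1 / np.sin(theta)

x0, x1 = np.array([1.0, 0.0]), np.array([0.0, 1.0])  # orthogonal unit vectors, theta = pi / 2
assert np.allclose(slerp(x0, x1, 0.0), x0) and np.allclose(slerp(x0, x1, 1.0), x1)
assert np.allclose(np.linalg.norm(slerp(x0, x1, 0.5)), 1.0)  # midpoint stays on the unit sphere

Unlike a straight linear blend, this keeps interpolated noise at the norm the scheduler expects, which is the usual reason to prefer it when interpolating between noise tensors.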
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_paddlenlp_available, is_paddlenlp_version) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, + is_paddlenlp_version, +) try: - if not (is_paddlenlp_available() and is_paddle_available() and - is_paddlenlp_version(">=", "2.5.2")): + if not (is_paddlenlp_available() and is_paddle_available() and is_paddlenlp_version(">=", "2.5.2")): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import AudioLDMPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py b/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py index 0ba945ffdf429..8354d5e18ad8b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py +++ b/ppdiffusers/ppdiffusers/pipelines/audioldm/pipeline_audioldm.py @@ -18,8 +18,11 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (ClapTextModelWithProjection, - RobertaTokenizer, SpeechT5HifiGan) +from paddlenlp.transformers import ( + ClapTextModelWithProjection, + RobertaTokenizer, + SpeechT5HifiGan, +) from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -65,13 +68,14 @@ class AudioLDMPipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: ClapTextModelWithProjection, - tokenizer: RobertaTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - vocoder: SpeechT5HifiGan, ): + self, + vae: AutoencoderKL, + text_encoder: ClapTextModelWithProjection, + tokenizer: RobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + vocoder: SpeechT5HifiGan, + ): super().__init__() self.register_modules( vae=vae, @@ -79,17 +83,19 @@ def __init__( tokenizer=tokenizer, unet=unet, scheduler=scheduler, - vocoder=vocoder, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + vocoder=vocoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) def _encode_prompt( - self, - prompt, - num_waveforms_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): """ Encodes the prompt into text encoder hidden states. @@ -113,13 +119,13 @@ def _encode_prompt( argument. 
""" if self.text_encoder.text_model.embeddings.token_type_ids.dtype not in [ - paddle.int16, - paddle.int32, - paddle.int64, + paddle.int16, + paddle.int32, + paddle.int64, ]: self.text_encoder.text_model.embeddings.token_type_ids = ( - self.text_encoder.text_model.embeddings.token_type_ids.cast( - "int32")) + self.text_encoder.text_model.embeddings.token_type_ids.cast("int32") + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -134,34 +140,35 @@ def _encode_prompt( max_length=self.tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids attention_mask = text_inputs.attention_mask untruncated_ids = self.tokenizer( prompt, padding="longest", return_tensors="pd", - return_attention_mask=True, ).input_ids - if (untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and - not paddle.equal_all( - x=text_input_ids, y=untruncated_ids).item()): + return_attention_mask=True, + ).input_ids + if ( + untruncated_ids.shape[-1] >= text_input_ids.shape[-1] + and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item() + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( f"The following part of your input was truncated because CLAP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" ) - prompt_embeds = self.text_encoder( - text_input_ids.cast("int32"), attention_mask=attention_mask) + prompt_embeds = self.text_encoder(text_input_ids.cast("int32"), attention_mask=attention_mask) prompt_embeds = prompt_embeds.text_embeds # additional L_2 normalization over each hidden-state prompt_embeds = F.normalize(x=prompt_embeds, axis=-1) prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) bs_embed, seq_len = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method - prompt_embeds = prompt_embeds.tile( - repeat_times=[1, num_waveforms_per_prompt]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_waveforms_per_prompt, seq_len]) + prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_waveforms_per_prompt, seq_len]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -187,33 +194,28 @@ def _encode_prompt( max_length=max_length, truncation=True, return_tensors="pd", - return_attention_mask=True, ) + return_attention_mask=True, + ) uncond_input_ids = uncond_input.input_ids attention_mask = uncond_input.attention_mask - negative_prompt_embeds = self.text_encoder( - uncond_input_ids.cast("int32"), attention_mask=attention_mask) + negative_prompt_embeds = self.text_encoder(uncond_input_ids.cast("int32"), attention_mask=attention_mask) negative_prompt_embeds = negative_prompt_embeds.text_embeds # additional L_2 normalization over each hidden-state - negative_prompt_embeds = F.normalize( - x=negative_prompt_embeds, axis=-1) + negative_prompt_embeds = F.normalize(x=negative_prompt_embeds, axis=-1) if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = 
negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - repeat_times=[1, num_waveforms_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_waveforms_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_waveforms_per_prompt]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_waveforms_per_prompt, seq_len]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - x=[negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds]) return prompt_embeds def decode_latents(self, latents): @@ -235,28 +237,27 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - audio_length_in_s, - vocoder_upsample_factor, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor if audio_length_in_s < min_audio_length_in_s: raise ValueError( @@ -266,8 +267,11 @@ def check_inputs( raise ValueError( f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of {self.vae_scale_factor}." ) - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) @@ -279,11 +283,8 @@ def check_inputs( raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." 
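The _encode_prompt method above concatenates negative_prompt_embeds and prompt_embeds into one batch precisely so the denoising loops elsewhere in this patch can run a single UNet forward pass, split the prediction, and blend the two halves with guidance_scale. A minimal sketch of that guidance arithmetic, with random tensors standing in for the UNet output (illustrative shapes, not part of the diff):

import paddle

guidance_scale = 2.5                       # AudioLDM's default guidance_scale in this patch
noise_pred = paddle.randn([2, 8, 64, 16])  # batched prediction: [uncond, text], hypothetical latent shape

noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
assert guided.shape == [1, 8, 64, 16]      # same shape as a single latent sample

With guidance_scale = 1 this reduces to the text-conditioned prediction; larger values push the sample further away from the unconditional direction.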
@@ -294,18 +295,13 @@ def check_inputs( f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}." ) - def prepare_latents(self, - batch_size, - num_channels_latents, - height, - dtype, - generator, - latents=None): + def prepare_latents(self, batch_size, num_channels_latents, height, dtype, generator, latents=None): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - self.vocoder.config.model_in_dim // self.vae_scale_factor, ) + self.vocoder.config.model_in_dim // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." @@ -322,24 +318,24 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - audio_length_in_s: Optional[float]=None, - num_inference_steps: int=10, - guidance_scale: float=2.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_waveforms_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - output_type: Optional[str]="np", ): + self, + prompt: Union[str, List[str]] = None, + audio_length_in_s: Optional[float] = None, + num_inference_steps: int = 10, + guidance_scale: float = 2.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_waveforms_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + output_type: Optional[str] = "np", + ): """ Function invoked when calling the pipeline for generation. @@ -406,18 +402,13 @@ def __call__( When returning a tuple, the first element is a list with the generated audios. """ # 0. 
Convert audio input length from seconds to spectrogram height - vocoder_upsample_factor = (np.prod(self.vocoder.config.upsample_rates) / - self.vocoder.config.sampling_rate) + vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate if audio_length_in_s is None: - audio_length_in_s = (self.unet.config.sample_size * - self.vae_scale_factor * - vocoder_upsample_factor) + audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor height = int(audio_length_in_s / vocoder_upsample_factor) - original_waveform_length = int(audio_length_in_s * - self.vocoder.config.sampling_rate) + original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate) if height % self.vae_scale_factor != 0: - height = (int(np.ceil(height / self.vae_scale_factor)) * - self.vae_scale_factor) + height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor logger.info( f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} so that it can be handled by the model. It will be cut to {audio_length_in_s} after the denoising process." ) @@ -430,7 +421,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -452,7 +444,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -466,21 +459,19 @@ def __call__( height, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( @@ -488,22 +479,19 @@ def __call__( t, encoder_hidden_states=None, class_labels=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -519,6 +507,6 @@ def __call__( audio = audio.numpy() if not return_dict: - return (audio, ) + return (audio,) return AudioPipelineOutput(audios=audio) diff --git a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py index cc5f2a1b40f43..b4bc68019bf35 100644 --- a/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py @@ -40,13 +40,13 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=100, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - audio_length_in_s: Optional[float]=None, - return_dict: bool=True, ) -> Union[AudioPipelineOutput, Tuple]: + self, + batch_size: int = 1, + num_inference_steps: int = 100, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + audio_length_in_s: Optional[float] = None, + return_dict: bool = True, + ) -> Union[AudioPipelineOutput, Tuple]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -67,18 +67,18 @@ def __call__( True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. 
""" if audio_length_in_s is None: - audio_length_in_s = (self.unet.config.sample_size / - self.unet.config.sample_rate) + audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate sample_size = audio_length_in_s * self.unet.config.sample_rate - down_scale_factor = 2**len(self.unet.up_blocks) + down_scale_factor = 2 ** len(self.unet.up_blocks) if sample_size < 3 * down_scale_factor: raise ValueError( f"{audio_length_in_s} is too small. Make sure it's bigger or equal to {3 * down_scale_factor / self.unet.config.sample_rate}." ) original_sample_size = int(sample_size) if sample_size % down_scale_factor != 0: - sample_size = (audio_length_in_s * self.unet.config.sample_rate // - down_scale_factor + 1) * down_scale_factor + sample_size = ( + audio_length_in_s * self.unet.config.sample_rate // down_scale_factor + 1 + ) * down_scale_factor logger.info( f"{audio_length_in_s} is increased to {sample_size / self.unet.config.sample_rate} so that it can be handled by the model. It will be cut to {original_sample_size / self.unet.config.sample_rate} after the denoising process." ) @@ -105,5 +105,5 @@ def __call__( audio = audio.clip(min=-1, max=1).astype(dtype="float32").cpu().numpy() audio = audio[:, :, :original_sample_size] if not return_dict: - return (audio, ) + return (audio,) return AudioPipelineOutput(audios=audio) diff --git a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py b/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py index ee8dbc0143053..2ffd3401ceb13 100644 --- a/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py +++ b/ppdiffusers/ppdiffusers/pipelines/ddim/pipeline_ddim.py @@ -42,15 +42,15 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - eta: float=0.0, - num_inference_steps: int=50, - use_clipped_model_output: Optional[bool]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + eta: float = 0.0, + num_inference_steps: int = 50, + use_clipped_model_output: Optional[bool] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -82,19 +82,20 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ) + self.unet.config.sample_size, + ) else: image_shape = ( batch_size, self.unet.config.in_channels, - *self.unet.config.sample_size, ) + *self.unet.config.sample_size, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
) - image = randn_tensor( - image_shape, generator=generator, dtype=self.unet.dtype) + image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -112,7 +113,8 @@ def __call__( image, eta=eta, use_clipped_model_output=use_clipped_model_output, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() @@ -120,5 +122,5 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py b/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py index cc73ea0e507a5..4ff2fe9a23bd9 100644 --- a/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py +++ b/ppdiffusers/ppdiffusers/pipelines/ddpm/pipeline_ddpm.py @@ -38,13 +38,13 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - num_inference_steps: int=1000, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + num_inference_steps: int = 1000, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -70,12 +70,14 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ) + self.unet.config.sample_size, + ) else: image_shape = ( batch_size, self.unet.config.in_channels, - *self.unet.config.sample_size, ) + *self.unet.config.sample_size, + ) image = randn_tensor(image_shape, generator=generator) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -84,12 +86,11 @@ def __call__( model_output = self.unet(image, t).sample # 2. 
compute previous image: x_t -> x_t-1 - image = self.scheduler.step( - model_output, t, image, generator=generator).prev_sample + image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py index ca49b436b3f91..fccb87f08b7b7 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/__init__.py @@ -18,12 +18,22 @@ import numpy as np import PIL -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_paddle_available, is_paddlenlp_available) -from .timesteps import (fast27_timesteps, smart27_timesteps, smart50_timesteps, - smart100_timesteps, smart185_timesteps, - super27_timesteps, super40_timesteps, - super100_timesteps) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) +from .timesteps import ( + fast27_timesteps, + smart27_timesteps, + smart50_timesteps, + smart100_timesteps, + smart185_timesteps, + super27_timesteps, + super40_timesteps, + super100_timesteps, +) @dataclass @@ -55,11 +65,11 @@ class IFPipelineOutput(BaseOutput): else: from .pipeline_if import IFPipeline from .pipeline_if_img2img import IFImg2ImgPipeline - from .pipeline_if_img2img_superresolution import \ - IFImg2ImgSuperResolutionPipeline + from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline from .pipeline_if_inpainting import IFInpaintingPipeline - from .pipeline_if_inpainting_superresolution import \ - IFInpaintingSuperResolutionPipeline + from .pipeline_if_inpainting_superresolution import ( + IFInpaintingSuperResolutionPipeline, + ) from .pipeline_if_superresolution import IFSuperResolutionPipeline from .safety_checker import IFSafetyChecker from .watermark import IFWatermarker diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py index 787a25590a6e1..2a7c3bddcaedd 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -19,14 +19,19 @@ from typing import Any, Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...loaders import LoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, is_bs4_available, is_ftfy_available, - logging, randn_tensor, replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -101,8 +106,8 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -113,15 +118,16 @@ class IFPipeline(DiffusionPipeline, LoraLoaderMixin): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -147,19 +153,21 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -186,7 +194,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -199,31 +208,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -238,8 +247,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -252,12 +260,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -266,12 +274,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -281,10 +291,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -296,11 +304,11 @@ def encode_prompt( def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -314,46 +322,44 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. 
Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -366,10 +372,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - def prepare_intermediate_images(self, batch_size, num_channels, height, - width, dtype, generator): + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator): shape = (batch_size, num_channels, height, width) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -377,8 +383,7 @@ def prepare_intermediate_images(self, batch_size, num_channels, height, f" size of {batch_size}. Make sure the batch size matches the length of the generators." ) - intermediate_images = randn_tensor( - shape, generator=generator, dtype=dtype) + intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype) # scale the initial noise by the standard deviation required by the scheduler intermediate_images = intermediate_images * self.scheduler.init_noise_sigma @@ -386,14 +391,12 @@ def prepare_intermediate_images(self, batch_size, num_channels, height, def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -419,11 +422,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -450,7 +455,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -477,15 +483,13 @@ def _clean_caption(self, caption): # "123456.." 
caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -503,13 +507,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -529,26 +530,26 @@ def _clean_caption(self, caption): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - num_inference_steps: int=100, - timesteps: List[int]=None, - guidance_scale: float=7.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - height: Optional[int]=None, - width: Optional[int]=None, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - clean_caption: bool=True, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = None, + width: Optional[int] = None, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -625,7 +626,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters height = height or self.unet.config.sample_size @@ -651,11 +653,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) # 4. Prepare timesteps if timesteps is not None: @@ -673,19 +675,19 @@ def __call__( height, width, prompt_embeds.dtype, - generator, ) + generator, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - model_input = (paddle.concat([intermediate_images] * 2) - if do_classifier_free_guidance else - intermediate_images) + model_input = ( + paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -694,7 +696,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: @@ -704,27 +707,28 @@ def __call__( model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) if self.scheduler.config.variance_type not in [ - "learned", - "learned_range", + "learned", + "learned_range", ]: noise_pred, _ = noise_pred.split( [ model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( @@ -732,12 +736,11 @@ def __call__( t, intermediate_images, **extra_step_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -750,16 +753,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) # 11. 
Apply watermark if self.watermarker is not None: - image = self.watermarker.apply_watermark( - image, self.unet.config.sample_size) + image = self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -770,8 +771,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -779,4 +779,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py index 7fa08748a3d86..30df336ebed8c 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -21,14 +21,19 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -55,8 +60,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -127,8 +131,8 @@ class IFImg2ImgPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -139,15 +143,16 @@ class IFImg2ImgPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -173,20 +178,22 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -213,7 +220,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -226,31 +234,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -265,8 +273,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -279,12 +286,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -293,12 +300,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -308,10 +317,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -324,11 +331,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -342,48 +349,46 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -396,19 +401,23 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) if isinstance(image, list): check_image_type = image[0] else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -422,21 +431,17 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -463,11 +468,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -494,7 +501,8 @@ def _clean_caption(self, caption): caption = re.sub( 
r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -521,15 +529,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -547,13 +553,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -598,35 +601,24 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] return timesteps, num_inference_steps - t_start - def prepare_intermediate_images(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): _, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -649,27 +641,33 @@ def prepare_intermediate_images(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - strength: float=0.7, - num_inference_steps: int=80, - timesteps: List[int]=None, - guidance_scale: float=10.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: 
Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - clean_caption: bool=True, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 0.7, + num_inference_steps: int = 80, + timesteps: List[int] = None, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -760,7 +758,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -775,11 +774,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -792,32 +791,29 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare intermediate images image = self.preprocess_image(image) image = image.cast(dtype) noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( - image, noise_timestep, batch_size, num_images_per_prompt, dtype, - generator) + image, noise_timestep, batch_size, num_images_per_prompt, dtype, generator + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - model_input = (paddle.concat([intermediate_images] * 2) - if do_classifier_free_guidance else - intermediate_images) + model_input = ( + paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -825,7 +821,8 @@ def __call__( model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -835,27 +832,25 @@ def __call__( model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -868,16 +863,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) # 11. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -888,8 +881,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -897,4 +889,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 42dd7fa35fa27..63e586bf00e34 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -22,14 +22,19 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -57,8 +62,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -130,8 +134,8 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -141,16 +145,17 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -182,20 +187,19 @@ def __init__( image_noising_scheduler=image_noising_scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) 
self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -222,11 +226,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -253,7 +259,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -280,15 +287,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -306,13 +311,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -332,14 +334,15 @@ def _clean_caption(self, caption): @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -366,7 +369,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -379,31 +383,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -419,8 +423,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -433,12 +436,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -447,12 +450,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -461,10 +466,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -477,11 +480,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -495,49 +498,47 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - original_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -550,7 +551,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # image @@ -559,12 +561,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -578,9 +583,7 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # original_image @@ -589,12 +592,15 @@ def check_inputs( else: check_image_type = original_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(original_image, list): image_batch_size = len(original_image) @@ -613,8 +619,7 @@ def check_inputs( ) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image - def preprocess_original_image(self, - image: PIL.Image.Image) -> paddle.Tensor: + def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor: if not isinstance(image, list): image = [image] @@ -642,21 +647,16 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image - def preprocess_image(self, image: PIL.Image.Image, - num_images_per_prompt) -> paddle.Tensor: + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor: if not isinstance(image, paddle.Tensor) and not isinstance(image, list): image = [image] @@ -679,8 +679,7 @@ def preprocess_image(self, image: PIL.Image.Image, elif dims == 4: image = paddle.concat(image, axis=0) else: - raise ValueError( - f"Image must have 3 or 4 dimensions, instead got {dims}") + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") image = image.cast(self.unet.dtype) @@ -691,8 +690,7 @@ def preprocess_image(self, image: PIL.Image.Image, # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -700,13 +698,7 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images - def prepare_intermediate_images(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_intermediate_images(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): _, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -729,30 +721,35 @@ def prepare_intermediate_images(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], - original_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, - List[PIL.Image.Image], List[ - paddle.Tensor], List[np.ndarray], ]=None, - strength: float=0.8, - prompt: Union[str, List[str]]=None, - num_inference_steps: int=50, - timesteps: List[int]=None, - guidance_scale: float=4.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - 
num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=250, - clean_caption: bool=True, ): + self, + image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], + original_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): """ Function invoked when calling the pipeline for generation. @@ -848,7 +845,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters @@ -865,11 +863,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -882,8 +880,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. prepare original image original_image = self.preprocess_original_image(original_image) @@ -891,8 +888,7 @@ def __call__( # 6. Prepare intermediate images noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( original_image, @@ -900,21 +896,19 @@ def __call__( batch_size, num_images_per_prompt, dtype, - generator, ) + generator, + ) # 7. 
Prepare upscaled image and noise level _, _, height, width = original_image.shape image = self.preprocess_image(image, num_images_per_prompt) - upscaled = F.interpolate( - image, (height, width), mode="bilinear", align_corners=True) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor( - upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise( - upscaled, noise, timesteps=noise_level) + noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) if do_classifier_free_guidance: noise_level = paddle.concat([noise_level] * 2) @@ -923,19 +917,15 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): model_input = paddle.concat( - [ - intermediate_images, - upscaled.cast(intermediate_images.dtype) - ], - axis=1, ) - - model_input = (paddle.concat([model_input] * 2) - if do_classifier_free_guidance else model_input) + [intermediate_images, upscaled.cast(intermediate_images.dtype)], + axis=1, + ) + + model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -944,7 +934,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -952,31 +943,27 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ model_input.shape[1] // 2, - noise_pred_uncond.shape[1] - model_input.shape[1] // - 2, + noise_pred_uncond.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1] // 2, - noise_pred_text.shape[1] - model_input.shape[1] // - 2, + noise_pred_text.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -989,16 +976,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 12. Convert to PIL image = self.numpy_to_pil(image) # 13. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -1008,8 +993,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -1017,4 +1001,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py index 72fd143c156c2..5ff5992901c78 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -21,14 +21,19 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -56,8 +61,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -130,8 +134,8 @@ class IFInpaintingPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -142,15 +146,16 @@ class IFInpaintingPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -176,20 +181,22 @@ def __init__( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -216,7 +223,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." 
+ ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -229,32 +237,32 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -269,8 +277,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -283,12 +290,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -297,12 +304,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -312,10 +321,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -328,11 +335,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -346,49 +353,47 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - mask_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -401,7 +406,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # image @@ -410,12 +416,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -429,9 +438,7 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # mask_image @@ -440,12 +447,15 @@ def check_inputs( else: check_image_type = mask_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(mask_image, list): image_batch_size = len(mask_image) @@ -466,14 +476,12 @@ def check_inputs( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -500,11 +508,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -531,7 +541,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -558,15 +569,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -584,13 +593,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -636,15 +642,11 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image @@ -653,10 +655,9 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = [mask_image] if isinstance(mask_image[0], paddle.Tensor): - mask_image = (paddle.concat( - mask_image, axis=0) - if mask_image[0].ndim == 4 else paddle.stack( - mask_image, axis=0)) + mask_image = ( + paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0) + ) if mask_image.ndim == 2: # Batch and add channel dim for single mask @@ -692,8 +693,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = paddle.to_tensor(mask_image) elif isinstance(mask_image[0], np.ndarray): - mask_image = np.concatenate( - [m[None, None, :] for m in mask_image], axis=0) + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) mask_image[mask_image < 0.5] = 0 mask_image[mask_image >= 0.5] = 1 @@ -704,8 +704,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -713,14 +712,15 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start def prepare_intermediate_images( - self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - mask_image, - generator=None, ): + self, + image, + timestep, + batch_size, + num_images_per_prompt, + dtype, + mask_image, + generator=None, + ): image_batch_size, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -745,29 +745,41 @@ def prepare_intermediate_images( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - mask_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - strength: float=1.0, - num_inference_steps: int=50, - timesteps: List[int]=None, - guidance_scale: float=7.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - 
callback_steps: int=1, - clean_caption: bool=True, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + mask_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 1.0, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -864,7 +876,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -879,11 +892,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -896,8 +909,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare intermediate images image = self.preprocess_image(image) @@ -907,15 +919,12 @@ def __call__( mask_image = mask_image.cast(dtype) if mask_image.shape[0] == 1: - mask_image = mask_image.repeat_interleave( - batch_size * num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0) else: - mask_image = mask_image.repeat_interleave( - num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0) noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( image, @@ -924,19 +933,19 @@ def __call__( num_images_per_prompt, dtype, mask_image, - generator, ) + generator, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - model_input = (paddle.concat([intermediate_images] * 2) - if do_classifier_free_guidance else - intermediate_images) + model_input = ( + paddle.concat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -944,7 +953,8 @@ def __call__( model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -954,33 +964,29 @@ def __call__( model_input.shape[1], noise_pred_uncond.shape[1] - model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1], noise_pred_text.shape[1] - model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 prev_intermediate_images = intermediate_images intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample - intermediate_images = ( - 1 - mask_image - ) * prev_intermediate_images + mask_image * intermediate_images + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -993,16 +999,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) # 11. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -1013,8 +1017,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 9. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -1022,4 +1025,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index a9d271872306a..7b1c73e660a40 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -22,14 +22,19 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, PIL_INTERPOLATION, is_bs4_available, - is_ftfy_available, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -57,8 +62,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -132,8 +136,8 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -144,16 +148,17 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -185,20 +190,19 @@ def __init__( image_noising_scheduler=image_noising_scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) 
self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -225,11 +229,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -256,7 +262,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -283,15 +290,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -309,13 +314,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... 
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -335,14 +337,15 @@ def _clean_caption(self, caption): @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. @@ -369,7 +372,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -382,31 +386,31 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -421,8 +425,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -435,12 +438,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -449,12 +452,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -464,10 +469,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -480,11 +483,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -498,50 +501,48 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - original_image, - mask_image, - batch_size, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" 
{type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -554,7 +555,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # image @@ -563,12 +565,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -582,9 +587,7 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # original_image @@ -593,12 +596,15 @@ def check_inputs( else: check_image_type = original_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`original_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(original_image, list): image_batch_size = len(original_image) @@ -623,12 +629,15 @@ def check_inputs( else: check_image_type = mask_image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`mask_image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(mask_image, list): image_batch_size = len(mask_image) @@ -647,8 +656,7 @@ def check_inputs( ) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image - def preprocess_original_image(self, - image: PIL.Image.Image) -> paddle.Tensor: + def preprocess_original_image(self, image: PIL.Image.Image) -> paddle.Tensor: if not isinstance(image, list): image = [image] @@ -676,21 +684,16 @@ def numpy_to_pd(images): image = numpy_to_pd(image) # to pd elif isinstance(image[0], np.ndarray): - image = (np.concatenate( - image, axis=0) if image[0].ndim == 4 else np.stack( - image, axis=0)) + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) image = numpy_to_pd(image) elif isinstance(image[0], paddle.Tensor): - image = (paddle.concat( - image, axis=0) if image[0].ndim == 4 else paddle.stack( - image, axis=0)) + image = paddle.concat(image, axis=0) if image[0].ndim == 4 else paddle.stack(image, axis=0) return image # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image - def preprocess_image(self, image: PIL.Image.Image, - num_images_per_prompt) -> paddle.Tensor: + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt) -> paddle.Tensor: if not isinstance(image, paddle.Tensor) and not isinstance(image, list): image = [image] @@ -713,8 +716,7 @@ def preprocess_image(self, image: PIL.Image.Image, elif dims == 4: image = paddle.concat(image, axis=0) else: - raise ValueError( - f"Image must have 3 or 4 dimensions, instead got {dims}") + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") image = image.cast(self.unet.dtype) @@ -728,10 +730,9 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = [mask_image] if isinstance(mask_image[0], paddle.Tensor): - mask_image = (paddle.concat( - mask_image, axis=0) - if mask_image[0].ndim == 4 else paddle.stack( - mask_image, axis=0)) + mask_image = ( + paddle.concat(mask_image, axis=0) if mask_image[0].ndim == 4 else paddle.stack(mask_image, axis=0) + ) if mask_image.ndim == 2: # Batch and add channel dim for single mask @@ -767,8 +768,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: mask_image = paddle.to_tensor(mask_image) elif isinstance(mask_image[0], np.ndarray): - mask_image = np.concatenate( - [m[None, None, :] for m in mask_image], axis=0) + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) mask_image[mask_image < 0.5] = 0 mask_image[mask_image >= 0.5] = 1 @@ -779,8 +779,7 @@ def preprocess_mask_image(self, mask_image) -> paddle.Tensor: # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -789,14 +788,15 @@ def get_timesteps(self, num_inference_steps, strength): # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images def prepare_intermediate_images( - self, - image, - timestep, - batch_size, - 
num_images_per_prompt, - dtype, - mask_image, - generator=None, ): + self, + image, + timestep, + batch_size, + num_images_per_prompt, + dtype, + mask_image, + generator=None, + ): image_batch_size, channels, height, width = image.shape batch_size = batch_size * num_images_per_prompt @@ -821,32 +821,43 @@ def prepare_intermediate_images( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], - original_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, - List[PIL.Image.Image], List[ - paddle.Tensor], List[np.ndarray], ]=None, - mask_image: Union[PIL.Image.Image, paddle.Tensor, np.ndarray, List[ - PIL.Image.Image], List[paddle.Tensor], List[np.ndarray], ]=None, - strength: float=0.8, - prompt: Union[str, List[str]]=None, - num_inference_steps: int=100, - timesteps: List[int]=None, - guidance_scale: float=4.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=0, - clean_caption: bool=True, ): + self, + image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor], + original_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + mask_image: Union[ + PIL.Image.Image, + paddle.Tensor, + np.ndarray, + List[PIL.Image.Image], + List[paddle.Tensor], + List[np.ndarray], + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + clean_caption: bool = True, + ): """ Function invoked when calling the pipeline for generation. @@ -948,7 +959,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters @@ -965,11 +977,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) dtype = prompt_embeds.dtype @@ -982,8 +994,7 @@ def __call__( self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. prepare original image original_image = self.preprocess_original_image(original_image) @@ -994,16 +1005,13 @@ def __call__( mask_image = mask_image.cast(dtype) if mask_image.shape[0] == 1: - mask_image = mask_image.repeat_interleave( - batch_size * num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, axis=0) else: - mask_image = mask_image.repeat_interleave( - num_images_per_prompt, axis=0) + mask_image = mask_image.repeat_interleave(num_images_per_prompt, axis=0) # 6. Prepare intermediate images noise_timestep = timesteps[0:1] - noise_timestep = noise_timestep.tile( - (batch_size * num_images_per_prompt, )) + noise_timestep = noise_timestep.tile((batch_size * num_images_per_prompt,)) intermediate_images = self.prepare_intermediate_images( original_image, @@ -1012,21 +1020,19 @@ def __call__( num_images_per_prompt, dtype, mask_image, - generator, ) + generator, + ) # 7. Prepare upscaled image and noise level _, _, height, width = original_image.shape image = self.preprocess_image(image, num_images_per_prompt) - upscaled = F.interpolate( - image, (height, width), mode="bilinear", align_corners=True) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor( - upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise( - upscaled, noise, timesteps=noise_level) + noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) if do_classifier_free_guidance: noise_level = paddle.concat([noise_level] * 2) @@ -1035,19 +1041,15 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): model_input = paddle.concat( - [ - intermediate_images, - upscaled.cast(intermediate_images.dtype) - ], - axis=1, ) - - model_input = (paddle.concat([model_input] * 2) - if do_classifier_free_guidance else model_input) + [intermediate_images, upscaled.cast(intermediate_images.dtype)], + axis=1, + ) + + model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -1056,7 +1058,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=noise_level, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: @@ -1064,37 +1067,31 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ model_input.shape[1] // 2, - noise_pred_uncond.shape[1] - model_input.shape[1] // - 2, + noise_pred_uncond.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1] // 2, - noise_pred_text.shape[1] - model_input.shape[1] // - 2, + noise_pred_text.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 prev_intermediate_images = intermediate_images intermediate_images = self.scheduler.step( - noise_pred, t, intermediate_images, - **extra_step_kwargs).prev_sample + noise_pred, t, intermediate_images, **extra_step_kwargs + ).prev_sample - intermediate_images = ( - 1 - mask_image - ) * prev_intermediate_images + mask_image * intermediate_images + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -1107,16 +1104,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 12. Convert to PIL image = self.numpy_to_pil(image) # 13. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -1127,8 +1122,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 11. 
Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -1136,4 +1130,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index b2aa43abe1a5c..ce92083c54c1a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -22,13 +22,18 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, T5EncoderModel, - T5Tokenizer) +from paddlenlp.transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler -from ...utils import (BACKENDS_MAPPING, is_bs4_available, is_ftfy_available, - logging, randn_tensor, replace_example_docstring) +from ...utils import ( + BACKENDS_MAPPING, + is_bs4_available, + is_ftfy_available, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import IFPipelineOutput from .safety_checker import IFSafetyChecker @@ -86,8 +91,8 @@ class IFSuperResolutionPipeline(DiffusionPipeline): watermarker: Optional[IFWatermarker] bad_punct_regex = re.compile( - r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + - "\|" + "\\" + "\/" + "\*" + r"]{1,}") # noqa + r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}" + ) # noqa _optional_components = [ "tokenizer", @@ -98,16 +103,17 @@ class IFSuperResolutionPipeline(DiffusionPipeline): ] def __init__( - self, - tokenizer: T5Tokenizer, - text_encoder: T5EncoderModel, - unet: UNet2DConditionModel, - scheduler: DDPMScheduler, - image_noising_scheduler: DDPMScheduler, - safety_checker: Optional[IFSafetyChecker], - feature_extractor: Optional[CLIPImageProcessor], - watermarker: Optional[IFWatermarker], - requires_safety_checker: bool=True, ): + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -139,20 +145,19 @@ def __init__( image_noising_scheduler=image_noising_scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - watermarker=watermarker, ) + watermarker=watermarker, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing def _text_preprocessing(self, text, clean_caption=False): if clean_caption and not is_bs4_available(): - logger.warn(BACKENDS_MAPPING["bs4"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") 
clean_caption = False if clean_caption and not is_ftfy_available(): - logger.warn(BACKENDS_MAPPING["ftfy"][-1].format( - "Setting `clean_caption=True`")) + logger.warn(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) logger.warn("Setting `clean_caption` to False...") clean_caption = False @@ -179,11 +184,13 @@ def _clean_caption(self, caption): caption = re.sub( r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls caption = re.sub( r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa "", - caption, ) # regex for urls + caption, + ) # regex for urls # html: caption = BeautifulSoup(caption, features="html.parser").text @@ -210,7 +217,8 @@ def _clean_caption(self, caption): caption = re.sub( r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa "-", - caption, ) + caption, + ) # кавычки к одному стандарту caption = re.sub(r"[`´«»“”¨]", '"', caption) @@ -237,15 +245,13 @@ def _clean_caption(self, caption): # "123456.." caption = re.sub(r"\b\d{6,}\b", "", caption) # filenames: - caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", - "", caption) + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) # caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" - caption = re.sub(self.bad_punct_regex, r" ", - caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " # this-is-my-cute-cat / this_is_my_cute_cat @@ -263,13 +269,10 @@ def _clean_caption(self, caption): caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) - caption = re.sub( - r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", - caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) caption = re.sub(r"\bpage\s+\d+\b", "", caption) - caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", - caption) # j2d1a2a... + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) @@ -289,14 +292,15 @@ def _clean_caption(self, caption): @paddle.no_grad() # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt def encode_prompt( - self, - prompt, - do_classifier_free_guidance=True, - num_images_per_prompt=1, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - clean_caption: bool=False, ): + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + clean_caption: bool = False, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -323,7 +327,8 @@ def encode_prompt( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -336,32 +341,32 @@ def encode_prompt( max_length = 77 if prompt_embeds is None: - prompt = self._text_preprocessing( - prompt, clean_caption=clean_caption) + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) text_inputs = self.tokenizer( prompt, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, max_length - 1:-1]) + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {max_length} tokens: {removed_text}") + f" {max_length} tokens: {removed_text}" + ) attention_mask = text_inputs.attention_mask prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] if self.text_encoder is not None: @@ -376,8 +381,7 @@ def encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -390,12 +394,12 @@ def encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt - uncond_tokens = self._text_preprocessing( - uncond_tokens, clean_caption=clean_caption) + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( uncond_tokens, @@ -404,12 +408,14 @@ def encode_prompt( truncation=True, return_attention_mask=True, add_special_tokens=True, - return_tensors="pd", ) + return_tensors="pd", + ) attention_mask = uncond_input.attention_mask negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: @@ -419,10 +425,8 @@ def encode_prompt( if dtype is not None: negative_prompt_embeds = negative_prompt_embeds.cast(dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -435,11 +439,11 @@ def encode_prompt( # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, nsfw_detected, watermark_detected = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype), ) + clip_input=safety_checker_input.pixel_values.cast(dtype), + ) else: nsfw_detected = None watermark_detected = None @@ -453,49 +457,47 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - batch_size, - noise_level, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -508,10 +510,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - if (noise_level < 0 or noise_level >= - self.image_noising_scheduler.config.num_train_timesteps): + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: raise ValueError( f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})" ) @@ -521,12 +523,15 @@ def check_inputs( else: check_image_type = image - if (not isinstance(check_image_type, paddle.Tensor) and - not isinstance(check_image_type, PIL.Image.Image) and - not isinstance(check_image_type, np.ndarray)): + if ( + not isinstance(check_image_type, paddle.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): raise ValueError( "`image` has to be of type `paddle.Tensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" - f" {type(check_image_type)}") + f" {type(check_image_type)}" + ) if isinstance(image, list): image_batch_size = len(image) @@ -540,13 +545,10 @@ def check_inputs( assert False if batch_size != image_batch_size: - raise ValueError( - f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" - ) + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") # Copied from ppdiffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images - def prepare_intermediate_images(self, batch_size, num_channels, height, - width, dtype, generator): + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, generator): shape = (batch_size, num_channels, height, width) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -554,8 +556,7 @@ def prepare_intermediate_images(self, batch_size, num_channels, height, f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
) - intermediate_images = randn_tensor( - shape, generator=generator, dtype=dtype) + intermediate_images = randn_tensor(shape, generator=generator, dtype=dtype) # scale the initial noise by the standard deviation required by the scheduler intermediate_images = intermediate_images * self.scheduler.init_noise_sigma @@ -584,8 +585,7 @@ def preprocess_image(self, image, num_images_per_prompt): elif dims == 4: image = paddle.concat(image, axis=0) else: - raise ValueError( - f"Image must have 3 or 4 dimensions, instead got {dims}") + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") image = image.cast(self.unet.dtype) @@ -596,28 +596,28 @@ def preprocess_image(self, image, num_images_per_prompt): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: int=None, - width: int=None, - image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor]=None, - num_inference_steps: int=50, - timesteps: List[int]=None, - guidance_scale: float=4.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=250, - clean_caption: bool=True, ): + self, + prompt: Union[str, List[str]] = None, + height: int = None, + width: int = None, + image: Union[PIL.Image.Image, np.ndarray, paddle.Tensor] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): """ Function invoked when calling the pipeline for generation. @@ -709,7 +709,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters @@ -729,11 +730,11 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - clean_caption=clean_caption, ) + clean_caption=clean_caption, + ) if do_classifier_free_guidance: - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) # 4. Prepare timesteps if timesteps is not None: @@ -752,39 +753,33 @@ def __call__( height, width, prompt_embeds.dtype, - generator, ) + generator, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Prepare upscaled image and noise level image = self.preprocess_image(image, num_images_per_prompt) - upscaled = F.interpolate( - image, (height, width), mode="bilinear", align_corners=True) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) noise_level = paddle.to_tensor([noise_level] * upscaled.shape[0]) - noise = randn_tensor( - upscaled.shape, generator=generator, dtype=upscaled.dtype) - upscaled = self.image_noising_scheduler.add_noise( - upscaled, noise, timesteps=noise_level) + noise = randn_tensor(upscaled.shape, generator=generator, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) if do_classifier_free_guidance: noise_level = paddle.concat([noise_level] * 2) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): model_input = paddle.concat( - [ - intermediate_images, - upscaled.cast(intermediate_images.dtype) - ], - axis=1, ) - - model_input = (paddle.concat([model_input] * 2) - if do_classifier_free_guidance else model_input) + [intermediate_images, upscaled.cast(intermediate_images.dtype)], + axis=1, + ) + + model_input = paddle.concat([model_input] * 2) if do_classifier_free_guidance else model_input model_input = self.scheduler.scale_model_input(model_input, t) # predict the noise residual @@ -794,7 +789,8 @@ def __call__( encoder_hidden_states=prompt_embeds, class_labels=noise_level, cross_attention_kwargs=cross_attention_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: @@ -802,21 +798,19 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ model_input.shape[1] // 2, - noise_pred_uncond.shape[1] - model_input.shape[1] // - 2, + noise_pred_uncond.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ model_input.shape[1] // 2, - noise_pred_text.shape[1] - model_input.shape[1] // - 2, + noise_pred_text.shape[1] - model_input.shape[1] // 2, ], - axis=1, ) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( @@ -824,12 +818,11 @@ def __call__( t, intermediate_images, **extra_step_kwargs, - return_dict=False, )[0] + return_dict=False, + )[0] # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, intermediate_images) @@ -842,16 +835,14 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 10. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) # 11. 
Convert to PIL image = self.numpy_to_pil(image) # 12. Apply watermark if self.watermarker is not None: - self.watermarker.apply_watermark(image, - self.unet.config.sample_size) + self.watermarker.apply_watermark(image, self.unet.config.sample_size) elif output_type == "pd": nsfw_detected = None watermark_detected = None @@ -862,8 +853,7 @@ def __call__( image = image.cpu().transpose([0, 2, 3, 1]).cast("float32").numpy() # 10. Run safety checker - image, nsfw_detected, watermark_detected = self.run_safety_checker( - image, prompt_embeds.dtype) + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, nsfw_detected, watermark_detected) @@ -871,4 +861,5 @@ def __call__( return IFPipelineOutput( images=image, nsfw_detected=nsfw_detected, - watermark_detected=watermark_detected, ) + watermark_detected=watermark_detected, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py index 8fcd1ab740f28..e4f32ce9b69a9 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/safety_checker.py @@ -15,8 +15,11 @@ import numpy as np import paddle import paddle.nn as nn -from paddlenlp.transformers import (CLIPConfig, CLIPVisionModelWithProjection, - PretrainedModel) +from paddlenlp.transformers import ( + CLIPConfig, + CLIPVisionModelWithProjection, + PretrainedModel, +) from ...utils import logging @@ -46,7 +49,8 @@ def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): if any(nsfw_detected): logger.warning( "Potential NSFW content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed.") + " Try again with a different prompt and/or seed." + ) for idx, nsfw_detected_ in enumerate(nsfw_detected): if nsfw_detected_: @@ -60,7 +64,8 @@ def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): if any(watermark_detected): logger.warning( "Potential watermarked content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed.") + " Try again with a different prompt and/or seed." 
+ ) for idx, watermark_detected_ in enumerate(watermark_detected): if watermark_detected_: diff --git a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py index 998eb357d858a..ad156baf5b46f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py +++ b/ppdiffusers/ppdiffusers/pipelines/deepfloyd_if/watermark.py @@ -29,8 +29,8 @@ def __init__(self): self.register_buffer( "watermark_image", - paddle.zeros( - (62, 62, 4), dtype=paddle.get_default_dtype()), ) + paddle.zeros((62, 62, 4), dtype=paddle.get_default_dtype()), + ) self.watermark_image_as_pil = None def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): @@ -45,9 +45,8 @@ def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w) S1, S2 = 1024**2, img_w * img_h - K = (S2 / S1)**0.5 - wm_size, wm_x, wm_y = int(K * - 62), img_w - int(14 * K), img_h - int(14 * K) + K = (S2 / S1) ** 0.5 + wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K) if self.watermark_image_as_pil is None: watermark_image = self.watermark_image.cpu().numpy().astype("uint8") @@ -55,12 +54,14 @@ def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): self.watermark_image_as_pil = watermark_image wm_img = self.watermark_image_as_pil.resize( - (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None) + (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None + ) for pil_img in images: pil_img.paste( wm_img, box=(wm_x - wm_size, wm_y - wm_size, wm_x, wm_y), - mask=wm_img.split()[-1], ) + mask=wm_img.split()[-1], + ) return images diff --git a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py b/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py index faf4f122a123f..ff5d4541cde55 100644 --- a/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py +++ b/ppdiffusers/ppdiffusers/pipelines/dit/pipeline_dit.py @@ -44,14 +44,14 @@ class DiTPipeline(DiffusionPipeline): """ def __init__( - self, - transformer: Transformer2DModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, - id2label: Optional[Dict[int, str]]=None, ): + self, + transformer: Transformer2DModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + id2label: Optional[Dict[int, str]] = None, + ): super().__init__() - self.register_modules( - transformer=transformer, vae=vae, scheduler=scheduler) + self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler) # create a imagenet -> id dictionary for easier use self.labels = {} @@ -88,14 +88,14 @@ def get_label_ids(self, label: Union[str, List[str]]) -> List[int]: @paddle.no_grad() def __call__( - self, - class_labels: List[int], - guidance_scale: float=4.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + class_labels: List[int], + guidance_scale: float = 4.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Function invoked when calling the pipeline for generation. 
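The hunk below only reformats the DiT pipeline's classifier-free-guidance path: when guidance_scale > 1 the latent batch is doubled, the second half of the class-label batch is set to the null class id 1000, and the conditional and unconditional noise predictions are blended. A minimal sketch of that blend step, with dummy paddle tensors and made-up shapes (the helper is illustrative only and not part of this patch):

import paddle

def blend_noise_predictions(cond_eps, uncond_eps, guidance_scale):
    # eps = uncond + s * (cond - uncond); s == 1 recovers the purely
    # conditional prediction, larger s pushes samples toward the class label.
    return uncond_eps + guidance_scale * (cond_eps - uncond_eps)

# hypothetical usage with dummy shapes
cond_eps = paddle.randn([2, 4, 32, 32])
uncond_eps = paddle.randn([2, 4, 32, 32])
guided = blend_noise_predictions(cond_eps, uncond_eps, guidance_scale=4.0)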
@@ -123,24 +123,22 @@ def __call__( latents = randn_tensor( shape=(batch_size, latent_channels, latent_size, latent_size), generator=generator, - dtype=self.transformer.dtype, ) - latent_model_input = (paddle.concat([latents] * 2) - if guidance_scale > 1 else latents) + dtype=self.transformer.dtype, + ) + latent_model_input = paddle.concat([latents] * 2) if guidance_scale > 1 else latents class_labels = paddle.to_tensor(class_labels).flatten() class_null = paddle.to_tensor([1000] * batch_size) - class_labels_input = (paddle.concat([class_labels, class_null], 0) - if guidance_scale > 1 else class_labels) + class_labels_input = paddle.concat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels # set step values self.scheduler.set_timesteps(num_inference_steps) for t in self.progress_bar(self.scheduler.timesteps): if guidance_scale > 1: - half = latent_model_input[:len(latent_model_input) // 2] + half = latent_model_input[: len(latent_model_input) // 2] latent_model_input = paddle.concat([half, half], axis=0) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) timesteps = t if not paddle.is_tensor(timesteps): @@ -154,22 +152,25 @@ def __call__( elif len(timesteps.shape) == 0: timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([latent_model_input.shape[0], ]) + timesteps = timesteps.expand( + [ + latent_model_input.shape[0], + ] + ) # predict noise model_output noise_pred = self.transformer( - latent_model_input, - timestep=timesteps, - class_labels=class_labels_input).sample + latent_model_input, timestep=timesteps, class_labels=class_labels_input + ).sample # perform guidance if guidance_scale > 1: eps, rest = ( noise_pred[:, :latent_channels], - noise_pred[:, latent_channels:], ) + noise_pred[:, latent_channels:], + ) bs = eps.shape[0] # TODO torch.split vs paddle.split - cond_eps, uncond_eps = paddle.split( - eps, [bs // 2, bs - bs // 2], axis=0) + cond_eps, uncond_eps = paddle.split(eps, [bs // 2, bs - bs // 2], axis=0) half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps) eps = paddle.concat([half_eps, half_eps], axis=0) @@ -182,13 +183,13 @@ def __call__( model_output, _ = paddle.split( noise_pred, [latent_channels, noise_pred.shape[1] - latent_channels], - axis=1, ) + axis=1, + ) else: model_output = noise_pred # compute previous image: x_t -> x_t-1 - latent_model_input = self.scheduler.step( - model_output, t, latent_model_input).prev_sample + latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample if guidance_scale > 1: latents, _ = latent_model_input.chunk(2, axis=0) @@ -207,6 +208,6 @@ def __call__( samples = self.numpy_to_pil(samples) if not return_dict: - return (samples, ) + return (samples,) return ImagePipelineOutput(images=samples) diff --git a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py b/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py index 8f75881eec2ef..9b672f9c0f8a5 100644 --- a/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/fastdeploy_utils.py @@ -26,18 +26,38 @@ from ..image_processor import VaeImageProcessor from ..schedulers import ( - DDIMScheduler, DDPMScheduler, DEISMultistepScheduler, - DPMSolverMultistepScheduler, DPMSolverSinglestepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, 
KDPM2AncestralDiscreteScheduler, - KDPM2DiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - PreconfigEulerAncestralDiscreteScheduler, PreconfigLMSDiscreteScheduler, - UniPCMultistepScheduler) + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + PreconfigEulerAncestralDiscreteScheduler, + PreconfigLMSDiscreteScheduler, + UniPCMultistepScheduler, +) from ..utils import ( - DIFFUSERS_CACHE, FASTDEPLOY_MODEL_NAME, FASTDEPLOY_WEIGHTS_NAME, - FROM_HF_HUB, HF_HUB_OFFLINE, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, _add_variant, _get_model_file, is_fastdeploy_available, - is_paddle_available, logging, randn_tensor) + DIFFUSERS_CACHE, + FASTDEPLOY_MODEL_NAME, + FASTDEPLOY_WEIGHTS_NAME, + FROM_HF_HUB, + HF_HUB_OFFLINE, + ONNX_EXTERNAL_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + _add_variant, + _get_model_file, + is_fastdeploy_available, + is_paddle_available, + logging, + randn_tensor, +) from ..version import VERSION as __version__ __all__ = ["FastDeployRuntimeModel", "FastDeployDiffusionPipelineMixin"] @@ -54,9 +74,7 @@ def fdtensor2pdtensor(fdtensor: "fd.C.FDTensor"): pdtensor = paddle.utils.dlpack.from_dlpack(dltensor) return pdtensor - def pdtensor2fdtensor(pdtensor: paddle.Tensor, - name: str="", - share_with_raw_ptr=False): + def pdtensor2fdtensor(pdtensor: paddle.Tensor, name: str = "", share_with_raw_ptr=False): if not share_with_raw_ptr: dltensor = paddle.utils.dlpack.to_dlpack(pdtensor) return fd.C.FDTensor.from_dlpack(name, dltensor) @@ -67,7 +85,8 @@ def pdtensor2fdtensor(pdtensor: paddle.Tensor, pdtensor.shape, pdtensor.dtype.name, str(pdtensor.place), - int(pdtensor.place.gpu_device_id()), ) + int(pdtensor.place.gpu_device_id()), + ) logger = logging.get_logger(__name__) @@ -88,7 +107,8 @@ def pdtensor2fdtensor(pdtensor: paddle.Tensor, [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -207,32 +227,20 @@ def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. 
""" max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -240,23 +248,21 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] # we must to tensor first! - return paddle.to_tensor( - tokens, dtype="int64"), paddle.to_tensor( - weights, dtype="float32") + return paddle.to_tensor(tokens, dtype="int64"), paddle.to_tensor(weights, dtype="float32") def get_unweighted_text_embeddings( - pipe, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, - infer_op=None, ): + pipe, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, + infer_op=None, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. @@ -267,8 +273,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -282,7 +287,8 @@ def get_unweighted_text_embeddings( text_embedding = pipe.text_encoder( input_ids=text_input_chunk, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] if no_boseos_middle: if i == 0: # discard the ending token @@ -305,20 +311,22 @@ def get_unweighted_text_embeddings( text_embeddings = pipe.text_encoder( input_ids=text_input, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return text_embeddings def get_weighted_text_embeddings( - pipe, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - infer_op=None, - **kwargs, ): + pipe, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + infer_op=None, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. 
For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -342,24 +350,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. """ - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -367,33 +370,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( @@ -404,7 +400,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( uncond_tokens, @@ -414,35 +411,34 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + 
chunk_length=pipe.tokenizer.model_max_length, + ) # get the embeddings text_embeddings = get_unweighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, - infer_op=infer_op, ) + infer_op=infer_op, + ) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, - infer_op=infer_op, ) + infer_op=infer_op, + ) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= ( - (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1) - .unsqueeze(-1)) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= ( - (previous_mean / uncond_embeddings.mean(axis=[-2, -1])) - .unsqueeze(-1).unsqueeze(-1)) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) if uncond_prompt is not None: return text_embeddings, uncond_embeddings @@ -459,8 +455,7 @@ def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs): continue module = getattr(self, name) if isinstance(module, FastDeployRuntimeModel): - infer_op = (infer_op_dict.get(name, "zero_copy_infer") - if module.is_spport_zero_copy() else "raw") + infer_op = infer_op_dict.get(name, "zero_copy_infer") if module.is_spport_zero_copy() else "raw" # if parse_prompt_type in ["lpw", "webui"] and name in ["text_encoder"]: # if infer_op != "raw": # logger.warning( @@ -470,19 +465,16 @@ def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs): new_infer_op_dict[name] = infer_op return new_infer_op_dict - def post_init(self, - vae_scaling_factor=0.18215, - vae_scale_factor=8, - dtype="float32"): + def post_init(self, vae_scaling_factor=0.18215, vae_scale_factor=8, dtype="float32"): self.vae_scaling_factor = vae_scaling_factor self.vae_scale_factor = vae_scale_factor - self.image_processor = VaeImageProcessor( - vae_scale_factor=vae_scale_factor, do_convert_rgb=True) + self.image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_convert_rgb=True) self.control_image_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, - do_normalize=False, ) + do_normalize=False, + ) self.dtype = dtype self.supported_scheduler = [ "pndm", @@ -533,53 +525,44 @@ def text_encoder_hidden_states_dim(self): def change_scheduler(self, scheduler_type="ddim", inplace=True): scheduler_type = scheduler_type.lower() if scheduler_type == "pndm": - scheduler = PNDMScheduler.from_config( - self.orginal_scheduler_config, skip_prk_steps=True) + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) elif scheduler_type == "lms": - scheduler = LMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "preconfig-lms": - scheduler = PreconfigLMSDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = PreconfigLMSDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "heun": - 
scheduler = HeunDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler": - scheduler = EulerDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "preconfig-euler-ancestral": - scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-multi": - scheduler = DPMSolverMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "dpm-single": - scheduler = DPMSolverSinglestepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2-ancestral": - scheduler = KDPM2AncestralDiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "kdpm2": - scheduler = KDPM2DiscreteScheduler.from_config( - self.orginal_scheduler_config) + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "unipc-multi": - scheduler = UniPCMultistepScheduler.from_config( - self.orginal_scheduler_config) + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) elif scheduler_type == "ddim": scheduler = DDIMScheduler.from_config( self.orginal_scheduler_config, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) elif scheduler_type == "ddpm": - scheduler = DDPMScheduler.from_config(self.orginal_scheduler_config, - ) + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) elif scheduler_type == "deis-multi": scheduler = DEISMultistepScheduler.from_config( - self.orginal_scheduler_config, ) + self.orginal_scheduler_config, + ) else: raise ValueError( f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" 
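The change_scheduler branches reformatted above all follow one pattern: pick a scheduler class by its string name and rebuild it with from_config on the pipeline's original scheduler config. A table-driven equivalent is sketched here for illustration only; the _SCHEDULERS mapping and build_scheduler helper are hypothetical, while the class names and per-scheduler kwargs are copied from the branches above (only a few entries shown, not the full list):

from ppdiffusers.schedulers import (
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
)

# name -> (scheduler class, extra kwargs used by the corresponding branch)
_SCHEDULERS = {
    "pndm": (PNDMScheduler, {"skip_prk_steps": True}),
    "lms": (LMSDiscreteScheduler, {}),
    "euler": (EulerDiscreteScheduler, {}),
    "euler-ancestral": (EulerAncestralDiscreteScheduler, {}),
    "ddim": (DDIMScheduler, {"steps_offset": 1, "clip_sample": False, "set_alpha_to_one": False}),
}

def build_scheduler(name, original_config):
    try:
        cls, extra_kwargs = _SCHEDULERS[name.lower()]
    except KeyError:
        raise ValueError(f"Scheduler of type {name} doesn't exist!")
    return cls.from_config(original_config, **extra_kwargs)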
@@ -590,16 +573,13 @@ def change_scheduler(self, scheduler_type="ddim", inplace=True): def get_timesteps(self, num_inference_steps, strength=1.0): if strength >= 1: - return self.scheduler.timesteps.cast( - self.dtype), num_inference_steps + return self.scheduler.timesteps.cast(self.dtype), num_inference_steps # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[ - t_start * self.scheduler.order:].cast(self.dtype) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].cast(self.dtype) if hasattr(self.scheduler, "step_index_offset"): self.scheduler.step_index_offset = t_start * self.scheduler.order @@ -615,24 +595,24 @@ def get_timesteps(self, num_inference_steps, strength=1.0): return timesteps, num_inference_steps def prepare_controlnet_cond( - self, - controlnet_cond, - controlnet_conditioning_scale, - width, - height, - batch_size, - num_images_per_prompt, - do_classifier_free_guidance=False, ): + self, + controlnet_cond, + controlnet_conditioning_scale, + width, + height, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance=False, + ): control_image = self.control_image_processor.preprocess( controlnet_cond, height=height, - width=width, ) + width=width, + ) if isinstance(controlnet_conditioning_scale, (float, int)): - controlnet_conditioning_scale = paddle.to_tensor( - [controlnet_conditioning_scale] * 13, dtype=self.dtype) + controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=self.dtype) elif isinstance(controlnet_conditioning_scale, (list, tuple)): - controlnet_conditioning_scale = paddle.to_tensor( - controlnet_conditioning_scale, dtype=self.dtype) + controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=self.dtype) else: raise ValueError( f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" @@ -650,40 +630,40 @@ def prepare_controlnet_cond( return control_image, controlnet_conditioning_scale def check_inputs( - self, - prompt, - height=512, - width=512, - callback_steps=1, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - strength=1.0, ): + self, + prompt, + height=512, + width=512, + callback_steps=1, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + strength=1.0, + ): if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: raise ValueError( f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." 
+ ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -696,24 +676,25 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") def prepare_latents( - self, - batch_size, - height, - width, - generator, - latents=None, - image=None, - timestep=None, - is_strength_max=True, - return_noise=False, - return_image_latents=False, - infer_op=None, ): + self, + batch_size, + height, + width, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + infer_op=None, + ): shape = [ batch_size, self.vae_decoder_num_latent_channels, @@ -739,46 +720,44 @@ def prepare_latents( if latents is None: noise = randn_tensor(shape, generator=generator, dtype=self.dtype) # if strength is 1. 
then initialise the latents to noise, else initial to image + noise - latents = (noise if is_strength_max else - self.scheduler.add_noise(image_latents, noise, timestep)) + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) # if pure noise then scale the initial latents by the Scheduler's init sigma - latents = (latents * self.scheduler.init_noise_sigma - if is_strength_max else latents) + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: noise = latents if str(noise.dtype).replace("paddle.", "") != self.dtype: noise = noise.cast(self.dtype) latents = noise * self.scheduler.init_noise_sigma - outputs = (latents, ) + outputs = (latents,) if return_noise: - outputs += (noise, ) + outputs += (noise,) if return_image_latents: - outputs += (image_latents, ) + outputs += (image_latents,) if len(outputs) == 1: outputs = latents return outputs def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - do_classifier_free_guidance, - return_masked_image_latents=True, - infer_op=None, ): + self, + mask, + masked_image, + batch_size, + height, + width, + do_classifier_free_guidance, + return_masked_image_latents=True, + infer_op=None, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision mask = paddle.nn.functional.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) mask = mask.cast(dtype=self.dtype) # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -791,8 +770,7 @@ def prepare_mask_latents( ) mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask if not return_masked_image_latents: return mask @@ -805,20 +783,18 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype=self.dtype) return mask, masked_image_latents def is_scheduler_support_step_index(self): - kwargs_keys = set( - inspect.signature(self.scheduler.step).parameters.keys()) + kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) return "kwargs" in kwargs_keys or "step_index" in kwargs_keys def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs): @@ -832,14 +808,12 @@ def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs): image_latents = self.vae_encoder( sample=image, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return self.vae_scaling_factor * image_latents - def _decode_vae_latents(self, - latents: paddle.Tensor, - infer_op=None, - **kwargs): + def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): latents_shape = latents.shape output_shape = [ latents_shape[0], @@ -850,22 +824,24 @@ def _decode_vae_latents(self, images_vae = self.vae_decoder( latent_sample=latents, infer_op=infer_op, - output_shape=output_shape, )[0] + output_shape=output_shape, + )[0] return images_vae def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - infer_op=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): if parse_prompt_type == "lpw": return self._encode_prompt_lpw( prompt, @@ -876,7 +852,8 @@ def _encode_prompt( negative_prompt_embeds=negative_prompt_embeds, max_embeddings_multiples=max_embeddings_multiples, infer_op="raw", # NOTE: we can't use zero copy! 
- **kwargs, ) + **kwargs, + ) elif parse_prompt_type == "raw": return self._encode_prompt_raw( prompt, @@ -885,22 +862,23 @@ def _encode_prompt( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - infer_op=infer_op, ) + infer_op=infer_op, + ) elif parse_prompt_type == "webui": - raise NotImplementedError( - "`parse_prompt_type=webui` is not implemented yet.") + raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") def _encode_prompt_lpw( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Union[str, List[str]], - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - infer_op=None, - max_embeddings_multiples: Optional[int]=3, - **kwargs, ): + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): r""" Encodes the prompt into text encoder hidden states. @@ -930,18 +908,19 @@ def _encode_prompt_lpw( if do_classifier_free_guidance: if negative_prompt is None: uncond_tokens = [""] * batch_size - elif prompt is not None and type(prompt) is not type( - negative_prompt): + elif prompt is not None and type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -951,37 +930,35 @@ def _encode_prompt_lpw( uncond_prompt=uncond_tokens, max_embeddings_multiples=max_embeddings_multiples, infer_op=infer_op, - **kwargs, ) + **kwargs, + ) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def _encode_prompt_raw( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - infer_op=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -1018,21 +995,22 @@ def _encode_prompt_raw( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", - return_tensors="pd").input_ids # check + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids # check - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) prompt_embeds = self.text_encoder( input_ids=text_input_ids, @@ -1041,13 +1019,13 @@ def _encode_prompt_raw( batch_size, self.tokenizer.model_max_length, self.text_encoder_hidden_states_dim, - ], )[0] + ], + )[0] bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -1056,14 +1034,16 @@ def _encode_prompt_raw( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -1073,7 +1053,8 @@ def _encode_prompt_raw( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) negative_prompt_embeds = self.text_encoder( input_ids=uncond_input.input_ids, infer_op=infer_op, @@ -1081,21 +1062,19 @@ def _encode_prompt_raw( batch_size, max_length, self.text_encoder_hidden_states_dim, - ], )[0] + ], + )[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -1104,17 +1083,15 @@ def run_safety_checker(self, image): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="np") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="np") image, has_nsfw_concept = self.safety_checker( images=image.numpy(), clip_input=safety_checker_input.pixel_values.astype(self.dtype), - infer_op="raw", ) + infer_op="raw", + ) image = paddle.to_tensor(image, dtype=self.dtype) return image, has_nsfw_concept @@ -1124,15 +1101,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -1140,9 +1115,7 @@ def prepare_extra_step_kwargs(self, generator, eta): class FastDeployRuntimeModel: def __init__(self, model=None, **kwargs): - logger.info( - "`ppdiffusers.FastDeployRuntimeModel` is experimental and might change in the future." 
- ) + logger.info("`ppdiffusers.FastDeployRuntimeModel` is experimental and might change in the future.") self.model = model self.model_save_dir = kwargs.get("model_save_dir", None) self.model_format = kwargs.get("model_format", None) @@ -1171,11 +1144,12 @@ def is_spport_zero_copy(self): return False def zero_copy_infer( - self, - prebinded_inputs: dict, - prebinded_outputs: dict, - share_with_raw_ptr=True, - **kwargs, ): + self, + prebinded_inputs: dict, + prebinded_outputs: dict, + share_with_raw_ptr=True, + **kwargs, + ): """ Execute inference without copying data from cpu to gpu. @@ -1186,17 +1160,11 @@ def zero_copy_infer( List of output tensor. """ for inputs_name, inputs_tensor in prebinded_inputs.items(): - input_fdtensor = pdtensor2fdtensor( - inputs_tensor, - inputs_name, - share_with_raw_ptr=share_with_raw_ptr) + input_fdtensor = pdtensor2fdtensor(inputs_tensor, inputs_name, share_with_raw_ptr=share_with_raw_ptr) self.model.bind_input_tensor(inputs_name, input_fdtensor) for outputs_name, outputs_tensor in prebinded_outputs.items(): - output_fdtensor = pdtensor2fdtensor( - outputs_tensor, - outputs_name, - share_with_raw_ptr=share_with_raw_ptr) + output_fdtensor = pdtensor2fdtensor(outputs_tensor, outputs_name, share_with_raw_ptr=share_with_raw_ptr) self.model.bind_output_tensor(outputs_name, output_fdtensor) self.model.zero_copy_infer() @@ -1222,25 +1190,27 @@ def __call__(self, **kwargs): self.zero_copy_infer( prebinded_inputs=inputs, prebinded_outputs={self.model.get_output_info(0).name: output}, - share_with_raw_ptr=share_with_raw_ptr, ) - return [output, ] + share_with_raw_ptr=share_with_raw_ptr, + ) + return [ + output, + ] elif infer_op == "raw": inputs = {} for k, v in kwargs.items(): if paddle.is_tensor(v): v = v.numpy() inputs[k] = np.array(v) - return [ - paddle.to_tensor(output) for output in self.model.infer(inputs) - ] + return [paddle.to_tensor(output) for output in self.model.infer(inputs)] else: raise ValueError("Unknown infer_op {}".format(infer_op)) @staticmethod def load_model( - model_path: Union[str, Path], - params_path: Union[str, Path]=None, - runtime_options: Optional["fd.RuntimeOption"]=None, ): + model_path: Union[str, Path], + params_path: Union[str, Path] = None, + runtime_options: Optional["fd.RuntimeOption"] = None, + ): """ Loads an FastDeploy Inference Model with fastdeploy.RuntimeOption @@ -1255,9 +1225,7 @@ def load_model( """ option = runtime_options if option is None or not isinstance(runtime_options, fd.RuntimeOption): - logger.info( - "No fastdeploy.RuntimeOption specified, using CPU device and paddle inference backend." - ) + logger.info("No fastdeploy.RuntimeOption specified, using CPU device and paddle inference backend.") option = fd.RuntimeOption() option.use_paddle_backend() option.use_cpu() @@ -1275,11 +1243,12 @@ def load_model( return fd.Runtime(option) def _save_pretrained( - self, - save_directory: Union[str, Path], - model_file_name: Optional[str]=None, - params_file_name: Optional[str]=None, - **kwargs, ): + self, + save_directory: Union[str, Path], + model_file_name: Optional[str] = None, + params_file_name: Optional[str] = None, + **kwargs, + ): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the [`~FastDeployRuntimeModel.from_pretrained`] class method. It will always save the @@ -1296,11 +1265,14 @@ def _save_pretrained( model with a different name. 
""" is_onnx_model = self.model_format == ModelFormat.ONNX - model_file_name = (model_file_name if model_file_name is not None else - FASTDEPLOY_MODEL_NAME - if not is_onnx_model else ONNX_WEIGHTS_NAME) - params_file_name = (params_file_name if params_file_name is not None - else FASTDEPLOY_WEIGHTS_NAME) + model_file_name = ( + model_file_name + if model_file_name is not None + else FASTDEPLOY_MODEL_NAME + if not is_onnx_model + else ONNX_WEIGHTS_NAME + ) + params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME src_model_path = self.model_save_dir.joinpath(self.latest_model_name) dst_model_path = Path(save_directory).joinpath(model_file_name) @@ -1312,19 +1284,16 @@ def _save_pretrained( if is_onnx_model: # copy external weights (for models >2GB) - src_model_path = self.model_save_dir.joinpath( - ONNX_EXTERNAL_WEIGHTS_NAME) + src_model_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) if src_model_path.exists(): - dst_model_path = Path(save_directory).joinpath( - ONNX_EXTERNAL_WEIGHTS_NAME) + dst_model_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) try: shutil.copyfile(src_model_path, dst_model_path) except shutil.SameFileError: pass if not is_onnx_model: - src_params_path = self.model_save_dir.joinpath( - self.latest_params_name) + src_params_path = self.model_save_dir.joinpath(self.latest_params_name) dst_params_path = Path(save_directory).joinpath(params_file_name) try: shutil.copyfile(src_params_path, dst_params_path) @@ -1332,9 +1301,10 @@ def _save_pretrained( pass def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - **kwargs, ): + self, + save_directory: Union[str, os.PathLike], + **kwargs, + ): """ Save a model to a directory, so that it can be re-loaded using the [`~FastDeployRuntimeModel.from_pretrained`] class method.: @@ -1344,9 +1314,7 @@ def save_pretrained( Directory to which to save. Will be created if it doesn't exist. 
""" if os.path.isfile(save_directory): - logger.error( - f"Provided path ({save_directory}) should be a directory, not a file" - ) + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return os.makedirs(save_directory, exist_ok=True) @@ -1356,23 +1324,24 @@ def save_pretrained( @classmethod def _from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, Path], - model_file_name: Optional[str]=None, - params_file_name: Optional[str]=None, - use_auth_token: Optional[Union[bool, str, None]]=None, - revision: Optional[str]=None, - subfolder: Optional[str]=None, - force_download: bool=False, - cache_dir: Optional[str]=None, - runtime_options: Optional["fd.RuntimeOption"]=None, - from_hf_hub: Optional[bool]=False, - proxies: Optional[Dict]=None, - resume_download: bool=False, - local_files_only: bool=False, - user_agent: Union[Dict, str, None]=None, - is_onnx_model: bool=False, - **kwargs, ): + cls, + pretrained_model_name_or_path: Union[str, Path], + model_file_name: Optional[str] = None, + params_file_name: Optional[str] = None, + use_auth_token: Optional[Union[bool, str, None]] = None, + revision: Optional[str] = None, + subfolder: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + runtime_options: Optional["fd.RuntimeOption"] = None, + from_hf_hub: Optional[bool] = False, + proxies: Optional[Dict] = None, + resume_download: bool = False, + local_files_only: bool = False, + user_agent: Union[Dict, str, None] = None, + is_onnx_model: bool = False, + **kwargs, + ): """ Load a model from a directory or the HF Hub. @@ -1404,24 +1373,25 @@ def _from_pretrained( kwargs will be passed to the model during initialization """ - model_file_name = (model_file_name if model_file_name is not None else - FASTDEPLOY_MODEL_NAME - if not is_onnx_model else ONNX_WEIGHTS_NAME) - params_file_name = (params_file_name if params_file_name is not None - else FASTDEPLOY_WEIGHTS_NAME) + model_file_name = ( + model_file_name + if model_file_name is not None + else FASTDEPLOY_MODEL_NAME + if not is_onnx_model + else ONNX_WEIGHTS_NAME + ) + params_file_name = params_file_name if params_file_name is not None else FASTDEPLOY_WEIGHTS_NAME kwargs["model_format"] = "ONNX" if is_onnx_model else "PADDLE" # load model from local directory if os.path.isdir(pretrained_model_name_or_path): - model_path = os.path.join(pretrained_model_name_or_path, - model_file_name) - params_path = ( - None if is_onnx_model else - os.path.join(pretrained_model_name_or_path, params_file_name)) + model_path = os.path.join(pretrained_model_name_or_path, model_file_name) + params_path = None if is_onnx_model else os.path.join(pretrained_model_name_or_path, params_file_name) model = FastDeployRuntimeModel.load_model( model_path, params_path, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) kwargs["model_save_dir"] = Path(pretrained_model_name_or_path) # load model from hub or paddle bos else: @@ -1437,7 +1407,8 @@ def _from_pretrained( resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, - user_agent=user_agent, ) + user_agent=user_agent, + ) if is_onnx_model: params_cache_path = None kwargs["latest_params_name"] = None @@ -1454,7 +1425,8 @@ def _from_pretrained( resume_download=resume_download, local_files_only=local_files_only, use_auth_token=use_auth_token, - user_agent=user_agent, ) + user_agent=user_agent, + ) kwargs["latest_params_name"] = Path(params_cache_path).name kwargs["model_save_dir"] = 
Path(model_cache_path).parent kwargs["latest_model_name"] = Path(model_cache_path).name @@ -1462,21 +1434,24 @@ def _from_pretrained( model = FastDeployRuntimeModel.load_model( model_cache_path, params_cache_path, - runtime_options=runtime_options, ) + runtime_options=runtime_options, + ) return cls(model=model, **kwargs) @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, Path], - model_file_name: Optional[str]=None, - params_file_name: Optional[str]=None, - runtime_options: Optional["fd.RuntimeOption"]=None, - is_onnx_model: bool=False, - **kwargs, ): + cls, + pretrained_model_name_or_path: Union[str, Path], + model_file_name: Optional[str] = None, + params_file_name: Optional[str] = None, + runtime_options: Optional["fd.RuntimeOption"] = None, + is_onnx_model: bool = False, + **kwargs, + ): from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) proxies = kwargs.pop("proxies", None) @@ -1508,4 +1483,5 @@ def from_pretrained( local_files_only=local_files_only, user_agent=user_agent, is_onnx_model=is_onnx_model, - **kwargs, ) + **kwargs, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py index 0ebba5a459d49..dd119ef22d12e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/__init__.py @@ -15,9 +15,11 @@ # flake8: noqa from ...utils import is_paddlenlp_available -from .pipeline_latent_diffusion_superresolution import \ - LDMSuperResolutionPipeline +from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline if is_paddlenlp_available(): - from .pipeline_latent_diffusion import (LDMBertConfig, LDMBertModel, - LDMTextToImagePipeline) + from .pipeline_latent_diffusion import ( + LDMBertConfig, + LDMBertModel, + LDMTextToImagePipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index f0d4f43308d80..e82dda6fe1de3 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -19,16 +19,20 @@ import paddle import paddle.nn as nn -from paddlenlp.transformers import (PretrainedConfig, PretrainedModel, - PretrainedTokenizer, register_base_model) -from paddlenlp.transformers.model_outputs import \ - BaseModelOutputWithPoolingAndCrossAttentions +from paddlenlp.transformers import ( + PretrainedConfig, + PretrainedModel, + PretrainedTokenizer, + register_base_model, +) +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, +) from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ...utils.initializer_utils import normal_, 
zeros_ from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -69,34 +73,29 @@ class LDMTextToImagePipeline(DiffusionPipeline): """ def __init__( - self, - vqvae: Union[VQModel, AutoencoderKL], - bert: PretrainedModel, - tokenizer: PretrainedTokenizer, - unet: Union[UNet2DModel, UNet2DConditionModel], - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], ): + self, + vqvae: Union[VQModel, AutoencoderKL], + bert: PretrainedModel, + tokenizer: PretrainedTokenizer, + unet: Union[UNet2DModel, UNet2DConditionModel], + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -104,35 +103,25 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) if tokenizer.model_max_length > 77: tokenizer.model_max_length = 77 - self.register_modules( - vqvae=vqvae, - bert=bert, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler) - self.vae_scale_factor = ( - 8 # 2 ** (len(self.vqvae.config.block_out_channels) - 1) - ) + self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + self.vae_scale_factor = 8 # 2 ** (len(self.vqvae.config.block_out_channels) - 1) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
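The `_encode_prompt` hunks around this point keep the usual classifier-free-guidance layout: conditional and unconditional text embeddings are duplicated once per generated image and stacked into a single batch, unconditional half first. A minimal sketch of that tensor bookkeeping with dummy tensors (the shapes and the helper name `expand_per_prompt` are illustrative only, not part of the pipeline):

import paddle

# Dummy stand-ins for text-encoder outputs: 2 prompts, 77 tokens, 1280-dim hidden states.
prompt_embeds = paddle.randn([2, 77, 1280])
negative_prompt_embeds = paddle.randn([2, 77, 1280])
num_images_per_prompt = 3

def expand_per_prompt(embeds, n):
    # Mirrors the tile/reshape trick used in the pipeline to repeat embeddings per generated image.
    bs, seq_len, dim = embeds.shape
    return embeds.tile([1, n, 1]).reshape([bs * n, seq_len, dim])

prompt_embeds = expand_per_prompt(prompt_embeds, num_images_per_prompt)
negative_prompt_embeds = expand_per_prompt(negative_prompt_embeds, num_images_per_prompt)

# Unconditional embeddings come first, so one UNet forward pass serves both guidance branches.
cfg_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
print(cfg_embeds.shape)  # [12, 77, 1280]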
@@ -168,21 +157,25 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because LDMBert can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - prompt_embeds = self.bert(text_input_ids, ) + prompt_embeds = self.bert( + text_input_ids, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.bert.dtype) @@ -190,8 +183,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -201,14 +193,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -218,28 +212,27 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - negative_prompt_embeds = self.bert(uncond_input.input_ids, ) + negative_prompt_embeds = self.bert( + uncond_input.input_ids, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.bert.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.bert.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -257,53 +250,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -316,17 +305,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -349,26 +340,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=256, - width: Optional[int]=256, - num_inference_steps: int=50, - guidance_scale: float=1.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ) -> Union[ - Tuple, ImagePipelineOutput]: + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 256, + width: Optional[int] = 256, + num_inference_steps: int = 50, + guidance_scale: float = 1.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> Union[Tuple, ImagePipelineOutput]: r""" Function invoked when calling the pipeline for generation. @@ -443,7 +433,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -465,7 +456,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -480,43 +472,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -533,7 +520,7 @@ def __call__( image = self.decode_latents(latents) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) @@ -554,25 +541,26 @@ class LDMBertConfig(PretrainedConfig): } def __init__( - self, - vocab_size=30522, - max_position_embeddings=77, - encoder_layers=32, - encoder_ffn_dim=5120, - encoder_attention_heads=8, - head_dim=64, - encoder_layerdrop=0.0, - activation_function="gelu", - d_model=1280, - dropout=0.1, - attention_dropout=0.0, - activation_dropout=0.0, - init_std=0.02, - classifier_dropout=0.0, - scale_embedding=False, - use_cache=True, - pad_token_id=0, - **kwargs, ): + self, + vocab_size=30522, + max_position_embeddings=77, + encoder_layers=32, + encoder_ffn_dim=5120, + encoder_attention_heads=8, + head_dim=64, + encoder_layerdrop=0.0, + activation_function="gelu", + d_model=1280, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + classifier_dropout=0.0, + scale_embedding=False, + use_cache=True, + pad_token_id=0, + **kwargs, + ): kwargs["return_dict"] = kwargs.pop("return_dict", True) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -590,9 +578,7 @@ def __init__( self.classifier_dropout = classifier_dropout self.use_cache = use_cache self.num_hidden_layers = encoder_layers - self.scale_embedding = ( - scale_embedding # scale factor will be sqrt(d_model) if True - ) + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -603,9 +589,7 @@ class 
LDMBertPretrainedModel(PretrainedModel): base_model_prefix = "ldmbert" config_class = LDMBertConfig _supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [ - r"encoder\.version", r"decoder\.version" - ] + _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"] def init_weights(self): """ @@ -626,9 +610,7 @@ def gradient_checkpointing_enable(self): activations". """ if not self.supports_gradient_checkpointing: - raise ValueError( - f"{self.__class__.__name__} does not support gradient checkpointing." - ) + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") self.apply(partial(self._set_gradient_checkpointing, value=True)) def gradient_checkpointing_disable(self): @@ -656,15 +638,15 @@ def _init_weights(self, module): class LDMBertEmbeddings(nn.Layer): def __init__( - self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.0, - max_position_embeddings=512, ): + self, + vocab_size, + hidden_size=768, + hidden_dropout_prob=0.0, + max_position_embeddings=512, + ): super().__init__() self.word_embeddings = nn.Embedding(vocab_size, hidden_size) - self.position_embeddings = nn.Embedding(max_position_embeddings, - hidden_size) + self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) self.dropout = nn.Dropout(hidden_dropout_prob) def forward(self, input_ids, position_ids=None): @@ -684,18 +666,19 @@ def forward(self, input_ids, position_ids=None): class TransformerEncoderLayer(nn.TransformerEncoderLayer): def __init__( - self, - d_model, - nhead, - dim_feedforward, - dropout=0.1, - activation="gelu", - attn_dropout=None, - act_dropout=None, - normalize_before=False, - weight_attr=None, - bias_attr=None, - head_dim=64, ): + self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="gelu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + head_dim=64, + ): super().__init__( d_model, nhead, @@ -706,7 +689,8 @@ def __init__( act_dropout, normalize_before, weight_attr, - bias_attr, ) + bias_attr, + ) # update self attn self.self_attn = LDMBertAttention( d_model, @@ -714,7 +698,8 @@ def __init__( nhead, dropout=attn_dropout, weight_attr=weight_attr, - bias_attr=False, ) + bias_attr=False, + ) @register_base_model @@ -727,7 +712,8 @@ def __init__(self, config: LDMBertConfig): config.vocab_size, config.d_model, config.dropout, - config.max_position_embeddings, ) + config.max_position_embeddings, + ) encoder_layer = TransformerEncoderLayer( config.d_model, config.encoder_attention_heads, @@ -737,10 +723,10 @@ def __init__(self, config: LDMBertConfig): attn_dropout=config.attention_dropout, act_dropout=config.activation_dropout, normalize_before=True, - head_dim=config.head_dim, ) + head_dim=config.head_dim, + ) - self.encoder = nn.TransformerEncoder(encoder_layer, - config.encoder_layers) + self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers) self.final_layer_norm = nn.LayerNorm(config.d_model) self.init_weights() @@ -751,56 +737,58 @@ def set_input_embeddings(self, value): self.embeddings.word_embeddings = value def forward( - self, - input_ids, - position_ids=None, - attention_mask=None, - output_hidden_states=False, - output_attentions=False, - return_dict=False, ): + self, + input_ids, + position_ids=None, + attention_mask=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): if attention_mask is not None and attention_mask.ndim == 2: # attention_mask [batch_size, 
sequence_length] -> [batch_size, 1, 1, sequence_length] - attention_mask = attention_mask.unsqueeze( - axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) attention_mask = (1.0 - attention_mask) * -1e4 - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids) + embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) encoder_outputs = self.encoder( embedding_output, src_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) if isinstance(encoder_outputs, type(embedding_output)): sequence_output = self.final_layer_norm(encoder_outputs) - return (sequence_output, ) + return (sequence_output,) else: sequence_output = encoder_outputs[0] sequence_output = self.final_layer_norm(sequence_output) if not return_dict: - return (sequence_output, ) + encoder_outputs[1:] + return (sequence_output,) + encoder_outputs[1:] return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) class LDMBertAttention(nn.MultiHeadAttention): def __init__( - self, - embed_dim, - head_dim, - num_heads, - dropout=0.0, - kdim=None, - vdim=None, - need_weights=False, - weight_attr=None, - bias_attr=None, ): + self, + embed_dim, + head_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + weight_attr=None, + bias_attr=None, + ): super().__init__( embed_dim, num_heads, @@ -809,15 +797,10 @@ def __init__( vdim, need_weights, weight_attr, - bias_attr, ) - assert ( - embed_dim > 0 - ), "Expected embed_dim to be greater than 0, " "but received {}".format( - embed_dim) - assert ( - num_heads > 0 - ), "Expected num_heads to be greater than 0, " "but received {}".format( - num_heads) + bias_attr, + ) + assert embed_dim > 0, "Expected embed_dim to be greater than 0, " "but received {}".format(embed_dim) + assert num_heads > 0, "Expected num_heads to be greater than 0, " "but received {}".format(num_heads) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -830,12 +813,9 @@ def __init__( self.inner_dim = head_dim * num_heads self.scaling = self.head_dim**-0.5 - self.q_proj = nn.Linear( - embed_dim, self.inner_dim, weight_attr, bias_attr=bias_attr) - self.k_proj = nn.Linear( - self.kdim, self.inner_dim, weight_attr, bias_attr=bias_attr) - self.v_proj = nn.Linear( - self.vdim, self.inner_dim, weight_attr, bias_attr=bias_attr) + self.q_proj = nn.Linear(embed_dim, self.inner_dim, weight_attr, bias_attr=bias_attr) + self.k_proj = nn.Linear(self.kdim, self.inner_dim, weight_attr, bias_attr=bias_attr) + self.v_proj = nn.Linear(self.vdim, self.inner_dim, weight_attr, bias_attr=bias_attr) self.out_proj = nn.Linear(self.inner_dim, embed_dim, weight_attr) @@ -847,18 +827,20 @@ def __init__(self, config: LDMBertConfig): self.init_weights() def forward( - self, - input_ids=None, - attention_mask=None, - position_ids=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, ): + self, + input_ids=None, + attention_mask=None, + position_ids=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): outputs = self.ldmbert( input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, 
output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) return outputs diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py index 0f37d4a18387d..24475c0af099b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -21,8 +21,13 @@ from ...models import UNet2DModel, VQModel from ...schedulers import ( - DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler) + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) from ...utils import PIL_INTERPOLATION, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -55,27 +60,32 @@ class LDMSuperResolutionPipeline(DiffusionPipeline): """ def __init__( - self, - vqvae: VQModel, - unet: UNet2DModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, ], ): + self, + vqvae: VQModel, + unet: UNet2DModel, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + ): super().__init__() self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) @paddle.no_grad() def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - batch_size: Optional[int]=1, - num_inference_steps: Optional[int]=100, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[Tuple, ImagePipelineOutput]: + self, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + batch_size: Optional[int] = 1, + num_inference_steps: Optional[int] = 100, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[Tuple, ImagePipelineOutput]: """ Args: image (`paddle.Tensor` or `PIL.Image.Image`): @@ -107,25 +117,20 @@ def __call__( elif isinstance(image, paddle.Tensor): batch_size = image.shape[0] else: - raise ValueError( - f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}" - ) + raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `paddle.Tensor` but is {type(image)}") if isinstance(image, PIL.Image.Image): image = preprocess(image) height, width = image.shape[-2:] # in_channels should be 6: 3 for latents, 3 for low resolution image - latents_shape = (batch_size, self.unet.config.in_channels // 2, height, - width) + latents_shape = (batch_size, self.unet.config.in_channels // 2, height, width) latents_dtype = self.unet.dtype - latents = randn_tensor( - latents_shape, generator=generator, dtype=latents_dtype) + latents = randn_tensor(latents_shape, generator=generator, dtype=latents_dtype) image = image.cast(latents_dtype) self.scheduler.set_timesteps(num_inference_steps) timesteps_tensor = self.scheduler.timesteps # scale the initial noise by the standard deviation required by the scheduler 
latents = latents * self.scheduler.init_noise_sigma - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_kwargs = {} if accepts_eta: extra_kwargs["eta"] = eta @@ -136,8 +141,7 @@ def __call__( # predict the noise residual noise_pred = self.unet(latents_input, t).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample # decode the image latents with the VQVAE image = self.vqvae.decode(latents).sample @@ -147,5 +151,5 @@ def __call__( if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index 5434c0cbb084e..11e66b2063f75 100644 --- a/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/ppdiffusers/ppdiffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -36,23 +36,21 @@ class LDMPipeline(DiffusionPipeline): [`DDIMScheduler`] is to be used in combination with `unet` to denoise the encoded image latents. """ - def __init__(self, - vqvae: VQModel, - unet: UNet2DModel, - scheduler: DDIMScheduler): + def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler): super().__init__() self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) @paddle.no_grad() - def __call__(self, - batch_size: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - eta: float=0.0, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs) -> Union[Tuple, ImagePipelineOutput]: + def __call__( + self, + batch_size: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + eta: float = 0.0, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs + ) -> Union[Tuple, ImagePipelineOutput]: """ Args: batch_size (`int`, *optional*, defaults to 1): @@ -77,8 +75,10 @@ def __call__(self, batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -86,8 +86,7 @@ def __call__(self, self.scheduler.set_timesteps(num_inference_steps) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_kwargs = {} if accepts_eta: extra_kwargs["eta"] = eta @@ -96,13 +95,12 @@ def __call__(self, # predict the noise residual noise_prediction = self.unet(latent_model_input, t).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_prediction, t, latents, - **extra_kwargs).prev_sample + latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample 
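The `accepts_eta` / `accepts_generator` checks that recur across these pipelines are plain signature introspection, so optional arguments are only forwarded to schedulers whose `step` actually declares them. A standalone sketch of the same pattern, assuming a hypothetical `step` function as a stand-in for `scheduler.step`:

import inspect

def step(noise_pred, t, sample, eta=0.0, generator=None):
    # Hypothetical stand-in for a scheduler's step(); only its signature matters here.
    return sample

accepted = set(inspect.signature(step).parameters.keys())
extra_kwargs = {}
if "eta" in accepted:
    extra_kwargs["eta"] = 0.0
if "generator" in accepted:
    extra_kwargs["generator"] = None
print(extra_kwargs)  # {'eta': 0.0, 'generator': None}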
image = self.vqvae.decode(latents).sample image = (image / 2 + 0.5).clip(min=0, max=1) image = image.cpu().transpose(perm=[0, 2, 3, 1]).numpy() if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py index a3967f589a49a..3d31b5e95e74f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/__init__.py @@ -19,8 +19,12 @@ import numpy as np import paddle -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_paddle_available, is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) @dataclass @@ -45,7 +49,7 @@ class VideoPipelineOutput(BaseOutput): except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import * else: - from .pipeline_latent_video_diffusion_model_text2video import \ - LVDMTextToVideoPipeline - from .pipeline_latent_video_diffusion_model_uncond import \ - LVDMUncondPipeline + from .pipeline_latent_video_diffusion_model_text2video import ( + LVDMTextToVideoPipeline, + ) + from .pipeline_latent_video_diffusion_model_uncond import LVDMUncondPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py index a727ada59472b..8e339ecfee43d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_text2video.py @@ -24,8 +24,7 @@ from ...configuration_utils import FrozenDict from ...models import LVDMAutoencoderKL, LVDMUNet3DModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . 
import VideoPipelineOutput from .video_save import save_results @@ -43,12 +42,12 @@ prompt="cutting in kitchen", num_frames=16, height=256, - width=256, - num_inference_steps=50, - generator=generator, + width=256, + num_inference_steps=50, + generator=generator, guidance_scale=15, - eta=1, - save_dir='.', + eta=1, + save_dir='.', save_name='ddim_lvdm_text_to_video_ucf', encoder_type='2d', scale_factor=0.18215, @@ -64,12 +63,10 @@ def split_video_to_clips(video, clip_length, drop_left=True): video_length = video.shape[2] shape = video.shape if video_length % clip_length != 0 and drop_left: - video = video[:, :, :video_length // clip_length * clip_length, :, :] - print( - f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") + video = video[:, :, : video_length // clip_length * clip_length, :, :] + print(f"[split_video_to_clips] Drop frames from {shape} to {video.shape}") nclips = video_length // clip_length - clips = rearrange( - video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) + clips = rearrange(video, "b c (nc cl) h w -> (b nc) c cl h w", cl=clip_length, nc=nclips) return clips @@ -104,34 +101,30 @@ class LVDMTextToVideoPipeline(DiffusionPipeline): """ def __init__( - self, - vae: LVDMAutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: LVDMUNet3DModel, - scheduler: KarrasDiffusionSchedulers, ): + self, + vae: LVDMAutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: LVDMUNet3DModel, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -139,11 +132,7 @@ def __init__( " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -153,7 +142,8 @@ def __init__( text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) # self.encoder_type = '2d' # self.scale_factor = 0.18215 @@ -166,12 +156,7 @@ def decode(self, z, **kwargs): return results @paddle.no_grad() - def overlapped_decode(self, - z, - max_z_t=None, - overlap_t=2, - predict_cids=False, - force_not_quantize=False): + def overlapped_decode(self, z, max_z_t=None, overlap_t=2, predict_cids=False, force_not_quantize=False): if max_z_t is None: max_z_t = z.shape[2] assert max_z_t > overlap_t @@ -190,69 +175,56 @@ def overlapped_decode(self, reses = [] for i, z_ in enumerate(zs): if i == 0: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, :max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, : max_x_t - drop_r_x, :, :] elif i == len(zs) - 1: - res = self.decode( - z_, predict_cids, - force_not_quantize).cpu()[:, :, drop_l_x:, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[:, :, drop_l_x:, :, :] else: - res = self.decode(z_, predict_cids, force_not_quantize).cpu( - )[:, :, drop_l_x:max_x_t - drop_r_x, :, :] + res = self.decode(z_, predict_cids, force_not_quantize).cpu()[ + :, :, drop_l_x : max_x_t - drop_r_x, :, : + ] reses.append(res) results = paddle.concat(x=reses, axis=2) return results @paddle.no_grad() - def decode_first_stage_2DAE_video(self, - z, - decode_bs=16, - return_cpu=True, - **kwargs): + def decode_first_stage_2DAE_video(self, z, decode_bs=16, return_cpu=True, **kwargs): b, _, t, _, _ = z.shape z = rearrange(z, "b c t h w -> (b t) c h w") if decode_bs is None: results = self.decode(z, **kwargs) else: - z = paddle.split( - x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // decode_bs, axis=0) if return_cpu: - results = paddle.concat( - x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) + results = paddle.concat(x=[self.decode(z_, **kwargs).cpu() for z_ in z], axis=0) else: - results = paddle.concat( - x=[self.decode(z_, **kwargs) for z_ in z], axis=0) - results = rearrange( - results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() + results = paddle.concat(x=[self.decode(z_, **kwargs) for z_ in z], axis=0) + results = rearrange(results, "(b t) c h w -> b c t h w", b=b, t=t).contiguous() return results @paddle.no_grad() def decode_latents( - self, - z, - decode_bs=16, - return_cpu=True, - bs=None, - decode_single_video_allframes=False, - max_z_t=None, - overlapped_length=0, - **kwargs, ): + self, + z, + decode_bs=16, + return_cpu=True, + bs=None, + decode_single_video_allframes=False, + max_z_t=None, + overlapped_length=0, + **kwargs, + ): b, _, t, _, _ = z.shape if kwargs["encoder_type"] == "2d" and z.dim() == 5: - return self.decode_first_stage_2DAE_video( - z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) + return self.decode_first_stage_2DAE_video(z, decode_bs=decode_bs, return_cpu=return_cpu, **kwargs) if decode_single_video_allframes: z = paddle.split(x=z, 
num_or_sections=z.shape[0] // 1, axis=0) cat_dim = 0 elif max_z_t is not None: if kwargs["encoder_type"] == "3d": - z = paddle.split( - x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) + z = paddle.split(x=z, num_or_sections=z.shape[2] // max_z_t, axis=2) cat_dim = 2 if kwargs["encoder_type"] == "2d": - z = paddle.split( - x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) + z = paddle.split(x=z, num_or_sections=z.shape[0] // max_z_t, axis=0) cat_dim = 0 # elif self.split_clips and self.downfactor_t is not None or self.clip_length is not None and self.downfactor_t is not None and z.shape[ # 2 @@ -286,8 +258,7 @@ def paddle_to_np(self, x): if isinstance("uint8", paddle.dtype): dtype = "uint8" - elif isinstance("uint8", - str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: + elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: dtype = "uint8" elif isinstance("uint8", paddle.Tensor): dtype = "uint8".dtype @@ -299,13 +270,14 @@ def paddle_to_np(self, x): return sample def _encode_prompt( - self, - prompt, - num_videos_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_videos_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -341,28 +313,30 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -370,8 +344,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_videos_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_videos_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_videos_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free 
guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -381,14 +354,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -398,36 +373,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_videos_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_videos_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_videos_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_videos_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -437,53 +409,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -496,22 +464,21 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." 
+ ) def prepare_latents( - self, - batch_size, - num_channels_latents, - num_frames, - height, - width, - dtype, - generator, - latents=None, ): - shape = [ - batch_size, num_channels_latents, num_frames, height // 8, - width // 8 - ] + self, + batch_size, + num_channels_latents, + num_frames, + height, + width, + dtype, + generator, + latents=None, + ): + shape = [batch_size, num_channels_latents, num_frames, height // 8, width // 8] if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -528,31 +495,31 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=256, - width: Optional[int]=256, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_videos_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - save_dir=None, - save_name=None, - num_frames: Optional[int]=16, - encoder_type="2d", - scale_factor=0.18215, - shift_factor=0, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 256, + width: Optional[int] = 256, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + save_dir=None, + save_name=None, + num_frames: Optional[int] = 16, + encoder_type="2d", + scale_factor=0.18215, + shift_factor=0, + ): r""" Function invoked when calling the pipeline for generation. @@ -628,9 +595,7 @@ def __call__( """ # 0. Default height and width to unet if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # 1. Check inputs. Raise error if not correct self.check_inputs( @@ -640,7 +605,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -662,7 +628,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -678,43 +645,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, timesteps=t, context=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -724,8 +686,7 @@ def __call__( "scale_factor": scale_factor, "shift_factor": shift_factor, } - sampled_videos = self.decode_latents( - latents, decode_bs=1, return_cpu=False, **extra_decode_kwargs) + sampled_videos = self.decode_latents(latents, decode_bs=1, return_cpu=False, **extra_decode_kwargs) all_videos.append(self.paddle_to_np(sampled_videos)) all_videos = np.concatenate(all_videos, axis=0) @@ -744,10 +705,9 @@ def __call__( videos_frames.append(video_frames) if not save_name: - save_name = f"defaul_video" + save_name = "default_video" if not save_dir: save_dir = "."
os.makedirs(save_dir, exist_ok=True) - save_results( - all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) + save_results(all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) return VideoPipelineOutput(frames=videos_frames, samples=sampled_videos) diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py index 5581777325761..3d64085312440 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/pipeline_latent_video_diffusion_model_uncond.py @@ -19,9 +19,6 @@ import numpy as np import paddle -import paddle.nn as nn -from paddlenlp.transformers import PretrainedModel, PretrainedTokenizer -from tqdm import trange from ...configuration_utils import FrozenDict from ...models import LVDMAutoencoderKL, LVDMUNet3DModel @@ -49,34 +46,29 @@ class LVDMUncondPipeline(DiffusionPipeline): """ def __init__( - self, - vae: LVDMAutoencoderKL, - unet: LVDMUNet3DModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], ): + self, + vae: LVDMAutoencoderKL, + unet: LVDMUNet3DModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) self.register_modules(vae=vae, unet=unet, scheduler=scheduler) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. 
@@ -113,8 +105,7 @@ def paddle_to_np(self, x): if isinstance("uint8", paddle.dtype): dtype = "uint8" - elif isinstance("uint8", - str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: + elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: dtype = "uint8" elif isinstance("uint8", paddle.Tensor): dtype = "uint8".dtype @@ -127,25 +118,25 @@ def paddle_to_np(self, x): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_frames: Optional[int]=16, - height: Optional[int]=256, - width: Optional[int]=256, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - eta: Optional[float]=0.0, - num_inference_steps: Optional[int]=50, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - save_dir=None, - save_name=None, - scale_factor: Optional[float]=0.33422927, - shift_factor: Optional[float]=1.4606637, - **kwargs, ) -> Union[Tuple, VideoPipelineOutput]: + self, + batch_size: int = 1, + num_frames: Optional[int] = 16, + height: Optional[int] = 256, + width: Optional[int] = 256, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + eta: Optional[float] = 0.0, + num_inference_steps: Optional[int] = 50, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + save_dir=None, + save_name=None, + scale_factor: Optional[float] = 0.33422927, + shift_factor: Optional[float] = 1.4606637, + **kwargs, + ) -> Union[Tuple, VideoPipelineOutput]: r""" Args: height (`int`, *optional*, defaults to 256): @@ -188,16 +179,15 @@ def __call__( """ if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) # get the initial random noise unless the user supplied it latents_shape = [ @@ -211,12 +201,11 @@ def __call__( if latents is None: latents = randn_tensor( latents_shape, - generator=generator, ) + generator=generator, + ) else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -231,30 +220,26 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta for i, t in enumerate(self.progress_bar(timesteps_tensor)): latent_model_input = latents - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) t_tensor = paddle.expand( t, - [latent_model_input.shape[0], ], ) + [ + latent_model_input.shape[0], + ], + ) # predict the noise residual noise_pred = self.unet(latent_model_input, t_tensor).sample # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, - t, - latents, - generator=generator, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, generator=generator, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -281,10 +266,9 @@ def __call__( videos_frames.append(video_frames) if not save_name: - save_name = f"defaul_video" + save_name = "default_video" if not save_dir: save_dir = "." os.makedirs(save_dir, exist_ok=True) - save_results( - all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) + save_results(all_videos, save_dir=save_dir, save_name=save_name, save_fps=8) return VideoPipelineOutput(frames=videos_frames, samples=sampled_videos) diff --git a/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py b/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py index 837050f0222df..a969643113c68 100644 --- a/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py +++ b/ppdiffusers/ppdiffusers/pipelines/lvdm/video_save.py @@ -33,12 +33,9 @@ av.logging.set_level(av.logging.ERROR) if not hasattr(av.video.frame.VideoFrame, "pict_type"): - av = ImportError( - """Your version of PyAV is too old for the necessary video operations.""" - ) + av = ImportError("""Your version of PyAV is too old for the necessary video operations.""") except ImportError: - av = ImportError( - """PyAV is not installed, and is necessary for the video operations.""") + av = ImportError("""PyAV is not installed, and is necessary for the video operations.""") def _check_av_available() -> None: @@ -47,15 +44,16 @@ def _check_av_available() -> None: def write_video( - filename: str, - video_array: paddle.Tensor, - fps: float, - video_codec: str="libx264", - options: Optional[Dict[str, Any]]=None, - audio_array: Optional[paddle.Tensor]=None, - audio_fps: Optional[float]=None, - audio_codec: Optional[str]=None, - audio_options: Optional[Dict[str, Any]]=None, ) -> None: + filename: str, + video_array: paddle.Tensor, + fps: float, + video_codec: str = "libx264", + options: Optional[Dict[str, Any]] = None, + audio_array: Optional[paddle.Tensor] = None, + audio_fps: Optional[float] = None, + audio_codec: Optional[str] = None, + audio_options: Optional[Dict[str, Any]] = None, +) -> None: """ Writes a 4d tensor in [T, H, W, C] format in a video file @@ -101,10 +99,8 @@ def write_video( audio_layout = "stereo" if num_channels > 1 else "mono" audio_sample_fmt = container.streams.audio[0].format.name format_dtype = np.dtype(audio_format_dtypes[audio_sample_fmt]) - audio_array = ( - paddle.to_tensor(data=audio_array).numpy().astype(format_dtype)) - frame = av.AudioFrame.from_ndarray( - 
audio_array, format=audio_sample_fmt, layout=audio_layout) + audio_array = paddle.to_tensor(data=audio_array).numpy().astype(format_dtype) + frame = av.AudioFrame.from_ndarray(audio_array, format=audio_sample_fmt, layout=audio_layout) frame.sample_rate = audio_fps for packet in a_stream.encode(frame): container.mux(packet) @@ -121,13 +117,14 @@ def write_video( @paddle.no_grad() def make_grid( - tensor: Union[paddle.Tensor, List[paddle.Tensor]], - nrow: int=8, - padding: int=2, - normalize: bool=False, - value_range: Optional[Tuple[int, int]]=None, - scale_each: bool=False, - pad_value: float=0.0, ) -> paddle.Tensor: + tensor: Union[paddle.Tensor, List[paddle.Tensor]], + nrow: int = 8, + padding: int = 2, + normalize: bool = False, + value_range: Optional[Tuple[int, int]] = None, + scale_each: bool = False, + pad_value: float = 0.0, +) -> paddle.Tensor: """ Make a grid of images. @@ -153,12 +150,9 @@ def make_grid( if isinstance(tensor, list): for t in tensor: if not paddle.is_tensor(x=t): - raise TypeError( - f"tensor or list of tensors expected, got a list containing {type(t)}" - ) + raise TypeError(f"tensor or list of tensors expected, got a list containing {type(t)}") else: - raise TypeError( - f"tensor or list of tensors expected, got {type(tensor)}") + raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}") if isinstance(tensor, list): tensor = paddle.stack(x=tensor, axis=0) if tensor.dim() == 2: @@ -172,9 +166,7 @@ def make_grid( if normalize is True: tensor = tensor.clone() if value_range is not None and not isinstance(value_range, tuple): - raise TypeError( - "value_range has to be a tuple (min, max) if specified. min and max are numbers" - ) + raise TypeError("value_range has to be a tuple (min, max) if specified. min and max are numbers") def norm_ip(img, low, high): img.clip_(min=low, max=high) @@ -198,32 +190,33 @@ def norm_range(t, value_range): nmaps = tensor.shape[0] xmaps = min(nrow, nmaps) ymaps = int(math.ceil(float(nmaps) / xmaps)) - height, width = int(tensor.shape[2] + padding), int(tensor.shape[3] + - padding) + height, width = int(tensor.shape[2] + padding), int(tensor.shape[3] + padding) num_channels = tensor.shape[1] grid = paddle.full( shape=(num_channels, height * ymaps + padding, width * xmaps + padding), fill_value=pad_value, - dtype=tensor.dtype, ) + dtype=tensor.dtype, + ) k = 0 for y in range(ymaps): for x in range(xmaps): if k >= nmaps: break - start_0 = (grid.shape[1] + y * height + padding - if y * height + padding < 0 else y * height + padding) - start_1 = (paddle.slice(grid, [1], [start_0], - [start_0 + height - padding]).shape[2] + x * - width + padding - if x * width + padding < 0 else x * width + padding) + start_0 = grid.shape[1] + y * height + padding if y * height + padding < 0 else y * height + padding + start_1 = ( + paddle.slice(grid, [1], [start_0], [start_0 + height - padding]).shape[2] + x * width + padding + if x * width + padding < 0 + else x * width + padding + ) paddle.assign( tensor[k], output=paddle.slice( - paddle.slice(grid, [1], [start_0], - [start_0 + height - padding]), + paddle.slice(grid, [1], [start_0], [start_0 + height - padding]), [2], [start_1], - [start_1 + width - padding], ), ) + [start_1 + width - padding], + ), + ) k = k + 1 return grid @@ -264,13 +257,12 @@ def to_tensor(pic) -> paddle.Tensor: if img.dtype == paddle.uint8: return paddle.divide( img.cast(default_float_dtype), - paddle.to_tensor( - 255, dtype=paddle.float32), ) + paddle.to_tensor(255, dtype=paddle.float32), + ) else: return img 
mode_to_nptype = {"I": np.int32, "I;16": np.int16, "F": np.float32} - img = paddle.to_tensor(data=np.array( - pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)) + img = paddle.to_tensor(data=np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True)) if pic.mode == "1": img = 255 * img img = img.reshape([pic.size[1], pic.size[0], get_image_num_channels(pic)]) @@ -299,20 +291,21 @@ def fill_with_black_squares(video, desired_len: int) -> paddle.Tensor: return paddle.concat( x=[ video, - paddle.zeros_like(x=video[0]).unsqueeze(axis=0) - .tile(repeat_times=[desired_len - len(video), 1, 1, 1]), + paddle.zeros_like(x=video[0]).unsqueeze(axis=0).tile(repeat_times=[desired_len - len(video), 1, 1, 1]), ], - axis=0, ) + axis=0, + ) def npz_to_video_grid( - data_path, - out_path, - num_frames=None, - fps=8, - num_videos=None, - nrow=None, - verbose=True, ): + data_path, + out_path, + num_frames=None, + fps=8, + num_videos=None, + nrow=None, + verbose=True, +): if isinstance(data_path, str): videos = load_num_videos(data_path, num_videos) elif isinstance(data_path, np.ndarray): @@ -332,22 +325,14 @@ def npz_to_video_grid( if num_frames is None: num_frames = videos.shape[1] if verbose: - videos = [ - fill_with_black_squares(v, num_frames) - for v in tqdm( - videos_th, desc="Adding empty frames") - ] + videos = [fill_with_black_squares(v, num_frames) for v in tqdm(videos_th, desc="Adding empty frames")] else: videos = [fill_with_black_squares(v, num_frames) for v in videos_th] frame_grids = paddle.stack(x=videos).transpose(perm=[1, 0, 2, 3, 4]) if nrow is None: nrow = int(np.ceil(np.sqrt(n))) if verbose: - frame_grids = [ - make_grid( - fs, nrow=nrow) for fs in tqdm( - frame_grids, desc="Making grids") - ] + frame_grids = [make_grid(fs, nrow=nrow) for fs in tqdm(frame_grids, desc="Making grids")] else: frame_grids = [make_grid(fs, nrow=nrow) for fs in frame_grids] @@ -356,21 +341,14 @@ def npz_to_video_grid( os.makedirs(os.path.dirname(out_path), exist_ok=True) if isinstance("uint8", paddle.dtype): dtype = "uint8" - elif isinstance("uint8", - str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: + elif isinstance("uint8", str) and "uint8" not in ["cpu", "cuda", "ipu", "xpu"]: dtype = "uint8" elif isinstance("uint8", paddle.Tensor): dtype = "uint8".dtype else: dtype = (paddle.stack(x=frame_grids) * 255).dtype - frame_grids = ((paddle.stack(x=frame_grids) * 255).transpose( - perm=[0, 2, 3, 1]).cast(dtype)) - write_video( - out_path, - frame_grids, - fps=fps, - video_codec="h264", - options={"crf": "10"}) + frame_grids = (paddle.stack(x=frame_grids) * 255).transpose(perm=[0, 2, 3, 1]).cast(dtype) + write_video(out_path, frame_grids, fps=fps, video_codec="h264", options={"crf": "10"}) def savenp2sheet(imgs, savepath, nrow=None): @@ -398,10 +376,7 @@ def savenp2sheet(imgs, savepath, nrow=None): n_rows = int(np.ceil(n / n_cols)) print(n_cols) print(n_rows) - imgsheet = cv2.vconcat([ - cv2.hconcat(imgs_new[i * n_cols:(i + 1) * n_cols]) - for i in range(n_rows) - ]) + imgsheet = cv2.vconcat([cv2.hconcat(imgs_new[i * n_cols : (i + 1) * n_cols]) for i in range(n_rows)]) cv2.imwrite(savepath, imgsheet) print(f"saved in {savepath}") @@ -414,7 +389,7 @@ def npz_to_imgsheet_5d(data_path, res_dir, nrow=None): else: raise Exception if os.path.isdir(res_dir): - res_path = os.path.join(res_dir, f"samples.jpg") + res_path = os.path.join(res_dir, "samples.jpg") else: assert res_dir.endswith(".jpg") res_path = res_dir @@ -423,24 +398,25 @@ def npz_to_imgsheet_5d(data_path, res_dir, nrow=None): def save_results( 
- videos, - save_dir, - save_name="results", - save_fps=8, - save_mp4=True, - save_npz=False, - save_mp4_sheet=False, - save_jpg=False, ): + videos, + save_dir, + save_name="results", + save_fps=8, + save_mp4=True, + save_npz=False, + save_mp4_sheet=False, + save_jpg=False, +): if save_mp4: save_subdir = os.path.join(save_dir, "videos") os.makedirs(save_subdir, exist_ok=True) shape_str = "x".join([str(x) for x in videos[0:1, (...)].shape]) for i in range(videos.shape[0]): npz_to_video_grid( - videos[i:i + 1, (...)], - os.path.join(save_subdir, - f"{save_name}_{i:03d}_{shape_str}.mp4"), - fps=save_fps, ) + videos[i : i + 1, (...)], + os.path.join(save_subdir, f"{save_name}_{i:03d}_{shape_str}.mp4"), + fps=save_fps, + ) print(f"Successfully saved videos in {save_subdir}") shape_str = "x".join([str(x) for x in videos.shape]) if save_npz: diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py index 4ba5c5d72ec8e..713ed5d8191b5 100644 --- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/image_encoder.py @@ -14,8 +14,11 @@ # limitations under the License. import paddle from paddle import nn -from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig, - CLIPVisionModel) +from paddlenlp.transformers import ( + CLIPPretrainedModel, + CLIPVisionConfig, + CLIPVisionModel, +) from ...models.attention import BasicTransformerBlock from ...utils import logging @@ -42,8 +45,8 @@ def __init__(self, config: CLIPVisionConfig, proj_size=None): self.uncond_vector = self.create_parameter( [1, 1, self.projection_dim], dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Assign( - paddle.rand((1, 1, self.projection_dim))), ) + default_initializer=nn.initializer.Assign(paddle.rand((1, 1, self.projection_dim))), + ) def forward(self, pixel_values, return_uncond_vector=False): clip_output = self.model(pixel_values=pixel_values) @@ -63,14 +66,18 @@ def __init__(self, config: CLIPVisionConfig): num_layers = (config.num_hidden_layers + 1) // 5 hid_size = config.hidden_size num_heads = 1 - self.blocks = nn.LayerList([ - BasicTransformerBlock( - hid_size, - num_heads, - hid_size, - activation_fn="gelu", - attention_bias=True, ) for _ in range(num_layers) - ]) + self.blocks = nn.LayerList( + [ + BasicTransformerBlock( + hid_size, + num_heads, + hid_size, + activation_fn="gelu", + attention_bias=True, + ) + for _ in range(num_layers) + ] + ) def forward(self, hidden_states): for block in self.blocks: diff --git a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index f6b679c76b433..8ed3770065a18 100644 --- a/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/ppdiffusers/ppdiffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -62,14 +62,11 @@ def prepare_mask_and_masked_image(image, mask): """ if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = 
image.unsqueeze(0) # Batch and add channel dim for single mask @@ -84,12 +81,9 @@ def prepare_mask_and_masked_image(image, mask): else: mask = mask.unsqueeze(0) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" assert mask.shape[1] == 1, "Mask image must have a single channel" # Check image is in [-1, 1] @@ -109,14 +103,12 @@ def prepare_mask_and_masked_image(image, mask): # Image as float32 image = image.cast(paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: if isinstance(image, PIL.Image.Image): image = [image] - image = np.concatenate( - [np.array(i.convert("RGB"))[None, :] for i in image], axis=0) + image = np.concatenate([np.array(i.convert("RGB"))[None, :] for i in image], axis=0) image = image.transpose(0, 3, 1, 2) image = paddle.to_tensor(image).cast(paddle.float32) / 127.5 - 1.0 @@ -124,8 +116,7 @@ def prepare_mask_and_masked_image(image, mask): if isinstance(mask, PIL.Image.Image): mask = [mask] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 # paint-by-example inverses the mask @@ -170,15 +161,15 @@ class PaintByExamplePipeline(DiffusionPipeline): _optional_components = ["safety_checker"] def __init__( - self, - vae: AutoencoderKL, - image_encoder: PaintByExampleImageEncoder, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, - LMSDiscreteScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae: AutoencoderKL, + image_encoder: PaintByExampleImageEncoder, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() self.register_modules( @@ -187,18 +178,18 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - 
clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -210,15 +201,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -234,40 +223,44 @@ def decode_latents(self, latents): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -283,22 +276,22 @@ def prepare_latents( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - dtype, - generator, - do_classifier_free_guidance, ): + self, + mask, + masked_image, + batch_size, + height, + width, + dtype, + generator, + do_classifier_free_guidance, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision mask = paddle.nn.functional.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) mask = mask.cast(dtype) masked_image = masked_image.cast(dtype) @@ -306,13 +299,12 @@ def prepare_mask_latents( # encode the mask image into latents space so we can concatenate it to the latents if isinstance(generator, list): masked_image_latents = [ - self.vae.encode(masked_image[i:i + 1]).latent_dist.sample( - generator=generator[i]) for i in range(batch_size) + self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(batch_size) ] masked_image_latents = paddle.concat(masked_image_latents, axis=0) else: - masked_image_latents = self.vae.encode( - masked_image).latent_dist.sample(generator=generator) + masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) masked_image_latents = self.vae.config.scaling_factor * masked_image_latents # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -331,71 +323,62 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype) return mask, masked_image_latents - def _encode_image(self, image, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance): dtype = self.image_encoder.dtype if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) - image_embeddings, negative_prompt_embeds = self.image_encoder( - image, return_uncond_vector=True) + image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True) # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, image_embeddings.shape[0], 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, 1, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, image_embeddings.shape[0], 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([bs_embed * num_images_per_prompt, 1, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings @paddle.no_grad() def __call__( - self, - example_image: Union[paddle.Tensor, PIL.Image.Image], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=5.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + example_image: Union[paddle.Tensor, PIL.Image.Image], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -477,8 +460,7 @@ def __call__( self.check_inputs(example_image, height, width, callback_steps) # 4. Encode input image - image_embeddings = self._encode_image( - example_image, num_images_per_prompt, do_classifier_free_guidance) + image_embeddings = self._encode_image(example_image, num_images_per_prompt, do_classifier_free_guidance) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -493,7 +475,8 @@ def __call__( width, image_embeddings.dtype, generator, - latents, ) + latents, + ) # 7. Prepare mask latent variables mask, masked_image_latents = self.prepare_mask_latents( @@ -504,60 +487,50 @@ def __call__( width, image_embeddings.dtype, generator, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # 8. Check that sizes of mask, masked image and latents match num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + num_channels_masked_image - != self.unet.config.in_channels): + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: raise ValueError( f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) # 9. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 10. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, masked_image_latents, mask], axis=1) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, masked_image_latents, mask], axis=1) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=image_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # must cast this, paddle.concat has bug... latents = latents.cast(image_embeddings.dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -566,8 +539,7 @@ def __call__( image = self.decode_latents(latents) # 12. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, image_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype) # 13. 
Convert to PIL if output_type == "pil": @@ -576,5 +548,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py index 9c25c86f78f6a..b51612c302879 100644 --- a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py @@ -30,9 +30,15 @@ import numpy as np import PIL import PIL.Image -from huggingface_hub import (create_repo, get_hf_file_metadata, hf_hub_url, - model_info, repo_type_and_id_from_hf_id, - snapshot_download, upload_folder) +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_url, + model_info, + repo_type_and_id_from_hf_id, + snapshot_download, + upload_folder, +) from huggingface_hub.utils import EntryNotFoundError from packaging import version from tqdm.auto import tqdm @@ -40,13 +46,31 @@ from ..configuration_utils import ConfigMixin from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from ..utils import ( - CONFIG_NAME, DEPRECATED_REVISION_ARGS, DIFFUSERS_CACHE, FLAX_WEIGHTS_NAME, - FROM_DIFFUSERS, FROM_HF_HUB, HF_HUB_OFFLINE, LOW_CPU_MEM_USAGE_DEFAULT, - ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PPDIFFUSERS_CACHE, - TO_DIFFUSERS, TORCH_SAFETENSORS_WEIGHTS_NAME, TORCH_WEIGHTS_NAME, - BaseOutput, deprecate, get_class_from_dynamic_module, is_paddle_available, - is_paddlenlp_available, is_safetensors_available, logging, numpy_to_pil, - ppdiffusers_bos_dir_download, ppdiffusers_url_download) + CONFIG_NAME, + DEPRECATED_REVISION_ARGS, + DIFFUSERS_CACHE, + FLAX_WEIGHTS_NAME, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_HUB_OFFLINE, + LOW_CPU_MEM_USAGE_DEFAULT, + ONNX_EXTERNAL_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + TO_DIFFUSERS, + TORCH_SAFETENSORS_WEIGHTS_NAME, + TORCH_WEIGHTS_NAME, + BaseOutput, + deprecate, + get_class_from_dynamic_module, + is_paddle_available, + is_paddlenlp_available, + is_safetensors_available, + logging, + numpy_to_pil, + ppdiffusers_bos_dir_download, + ppdiffusers_url_download, +) from ..version import VERSION as __version__ if is_paddle_available(): @@ -133,8 +157,7 @@ class AudioPipelineOutput(BaseOutput): audios: np.ndarray -def is_safetensors_compatible(filenames, variant=None, - passed_components=None) -> bool: +def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool: """ Checking for safetensors compatibility: - By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch @@ -154,8 +177,7 @@ def is_safetensors_compatible(filenames, variant=None, for filename in filenames: _, extension = os.path.splitext(filename) - if (len(filename.split("/")) == 2 and - filename.split("/")[0] in passed_components): + if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components: continue if extension == ".bin": @@ -183,8 +205,7 @@ def is_safetensors_compatible(filenames, variant=None, return True -def variant_compatible_siblings(filenames, - variant=None) -> Union[List[os.PathLike], str]: +def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]: weight_names = [ TORCH_WEIGHTS_NAME, TORCH_SAFETENSORS_WEIGHTS_NAME, @@ -217,35 +238,17 @@ def variant_compatible_siblings(filenames, 
rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" ) # `text_encoder/pytorch_model.bin.index.json` - non_variant_index_re = re.compile( - rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json" - ) + non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") if variant is not None: - variant_weights = { - f - for f in filenames - if variant_file_re.match(f.split("/")[-1]) is not None - } - variant_indexes = { - f - for f in filenames - if variant_index_re.match(f.split("/")[-1]) is not None - } + variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None} + variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None} variant_filenames = variant_weights | variant_indexes else: variant_filenames = set() - non_variant_weights = { - f - for f in filenames - if non_variant_file_re.match(f.split("/")[-1]) is not None - } - non_variant_indexes = { - f - for f in filenames - if non_variant_index_re.match(f.split("/")[-1]) is not None - } + non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None} + non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None} non_variant_filenames = non_variant_weights | non_variant_indexes # all variant filenames will be used by default @@ -254,12 +257,10 @@ def variant_compatible_siblings(filenames, def convert_to_variant(filename): if "index" in filename: variant_filename = filename.replace("index", f"index.{variant}") - elif (re.compile(f"^(.*?){transformers_index_format}").match(filename) - is not None): + elif re.compile(f"^(.*?){transformers_index_format}").match(filename) is not None: variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}" else: - variant_filename = ( - f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}") + variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}" return variant_filename for f in non_variant_filenames: @@ -270,51 +271,46 @@ def convert_to_variant(filename): return usable_filenames, variant_filenames -def warn_deprecated_model_variant(pretrained_model_name_or_path, use_auth_token, - variant, revision, model_filenames): +def warn_deprecated_model_variant(pretrained_model_name_or_path, use_auth_token, variant, revision, model_filenames): info = model_info( pretrained_model_name_or_path, use_auth_token=use_auth_token, - revision=None, ) + revision=None, + ) filenames = {sibling.rfilename for sibling in info.siblings} - comp_model_filenames, _ = variant_compatible_siblings( - filenames, variant=revision) - comp_model_filenames = [ - ".".join(f.split(".")[:1] + f.split(".")[2:]) - for f in comp_model_filenames - ] + comp_model_filenames, _ = variant_compatible_siblings(filenames, variant=revision) + comp_model_filenames = [".".join(f.split(".")[:1] + f.split(".")[2:]) for f in comp_model_filenames] if set(comp_model_filenames) == set(model_filenames): warnings.warn( f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. 
Please use `variant='{revision}'` instead.", - FutureWarning, ) + FutureWarning, + ) else: warnings.warn( f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added.", - FutureWarning, ) + FutureWarning, + ) def maybe_raise_or_warn( - library_name, - library, - class_name, - importable_classes, - passed_class_obj, - name, - is_pipeline_module, ): + library_name, + library, + class_name, + importable_classes, + passed_class_obj, + name, + is_pipeline_module, +): """Simple helper method to raise or warn in case incorrect module has been passed""" if not is_pipeline_module: library = importlib.import_module(library_name) class_obj = getattr(library, class_name) - class_candidates = { - c: getattr(library, c, None) - for c in importable_classes.keys() - } + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} expected_class_obj = None for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, - class_candidate): + if class_candidate is not None and issubclass(class_obj, class_candidate): expected_class_obj = class_candidate # Dynamo wraps the original model in a private class. @@ -325,15 +321,16 @@ def maybe_raise_or_warn( if not issubclass(model_cls, expected_class_obj): raise ValueError( f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be" - f" {expected_class_obj}") + f" {expected_class_obj}" + ) else: logger.warning( f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it" - " has the correct type") + " has the correct type" + ) -def get_class_obj_and_candidates(library_name, class_name, importable_classes, - pipelines, is_pipeline_module): +def get_class_obj_and_candidates(library_name, class_name, importable_classes, pipelines, is_pipeline_module): """Simple helper method to retrieve class object of module as well as potential parent class objects""" if is_pipeline_module: pipeline_module = getattr(pipelines, library_name) @@ -344,19 +341,12 @@ def get_class_obj_and_candidates(library_name, class_name, importable_classes, # else we just import it from the library. 
library = importlib.import_module(library_name) class_obj = getattr(library, class_name) - class_candidates = { - c: getattr(library, c, None) - for c in importable_classes.keys() - } + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} return class_obj, class_candidates -def _get_pipeline_class(class_obj, - config, - custom_pipeline=None, - cache_dir=None, - revision=None): +def _get_pipeline_class(class_obj, config, custom_pipeline=None, cache_dir=None, revision=None): if custom_pipeline is not None: if custom_pipeline.endswith(".py"): path = Path(custom_pipeline) @@ -370,31 +360,32 @@ def _get_pipeline_class(class_obj, custom_pipeline, module_file=file_name, cache_dir=cache_dir, - revision=revision, ) + revision=revision, + ) if class_obj != DiffusionPipeline: return class_obj - ppdiffusers_module = importlib.import_module( - class_obj.__module__.split(".")[0]) + ppdiffusers_module = importlib.import_module(class_obj.__module__.split(".")[0]) return getattr(ppdiffusers_module, config["_class_name"]) def load_sub_model( - library_name: str, - class_name: str, - importable_classes: List[Any], - pipelines: Any, - is_pipeline_module: bool, - pipeline_class: Any, - paddle_dtype: paddle.dtype, - runtime_options: Any, - model_variants: Dict[str, str], - name: str, - from_diffusers: bool, - low_cpu_mem_usage: bool=False, - cached_folder: Union[str, os.PathLike]=None, - **kwargs, ): + library_name: str, + class_name: str, + importable_classes: List[Any], + pipelines: Any, + is_pipeline_module: bool, + pipeline_class: Any, + paddle_dtype: paddle.dtype, + runtime_options: Any, + model_variants: Dict[str, str], + name: str, + from_diffusers: bool, + low_cpu_mem_usage: bool = False, + cached_folder: Union[str, os.PathLike] = None, + **kwargs, +): # support huggingface diffusers onnx model is_onnx_model = False if "Onnx" in class_name: @@ -403,29 +394,29 @@ def load_sub_model( """Helper method to load the module `name` from `library_name` and `class_name`""" # retrieve class candidates class_obj, class_candidates = get_class_obj_and_candidates( - library_name, class_name, importable_classes, pipelines, - is_pipeline_module) + library_name, class_name, importable_classes, pipelines, is_pipeline_module + ) load_method_name = None # retrive load method name for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, - class_candidate): + if class_candidate is not None and issubclass(class_obj, class_candidate): load_method_name = importable_classes[class_name][1] # if load method name is None, then we have a dummy module -> raise Error if load_method_name is None: none_module = class_obj.__module__ - is_dummy_path = none_module.startswith( - DUMMY_MODULES_FOLDER) or none_module.startswith( - PADDLENLP_DUMMY_MODULES_FOLDER) + is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith( + PADDLENLP_DUMMY_MODULES_FOLDER + ) if is_dummy_path and "dummy" in none_module: # call class_obj for nice error message of missing requirements class_obj() raise ValueError( f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have" - f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}.") + f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}." 
+ ) load_method = getattr(class_obj, load_method_name) @@ -435,17 +426,17 @@ def load_sub_model( # FastDeploy Model if issubclass(class_obj, FastDeployRuntimeModel): loading_kwargs["runtime_options"] = ( - runtime_options.get(name, None) - if isinstance(runtime_options, dict) else runtime_options) + runtime_options.get(name, None) if isinstance(runtime_options, dict) else runtime_options + ) if not is_onnx_model: if os.path.isdir(os.path.join(cached_folder, name)): is_onnx_model = any( - d.endswith(".onnx") or d.endswith(".pb") - for d in os.listdir(os.path.join(cached_folder, name))) + d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder, name)) + ) else: is_onnx_model = any( - d.endswith(".onnx") or d.endswith(".pb") - for d in os.listdir(os.path.join(cached_folder))) + d.endswith(".onnx") or d.endswith(".pb") for d in os.listdir(os.path.join(cached_folder)) + ) loading_kwargs["is_onnx_model"] = is_onnx_model from ppdiffusers import ModelMixin @@ -461,8 +452,7 @@ def load_sub_model( try: # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): - loaded_sub_model = load_method( - os.path.join(cached_folder, name), **loading_kwargs) + loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) else: # else load from the root directory loaded_sub_model = load_method(cached_folder, **loading_kwargs) @@ -478,11 +468,10 @@ def load_sub_model( loaded_sub_model = load_method( pretrained_model_name_or_path + "/" + name, cache_dir=cache_dir, - **loading_kwargs, ) - if loaded_sub_model is None: - raise ValueError( - f"We cant load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! \n {e} " + **loading_kwargs, ) + if loaded_sub_model is None: + raise ValueError(f"We cant load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! \n {e} ") return loaded_sub_model @@ -517,19 +506,15 @@ def register_modules(self, **kwargs): register_dict = {name: (None, None)} else: # TODO (junnyu) support paddlenlp.transformers - if "paddlenlp" in module.__module__.split( - ".") or "ppnlp_patch_utils" in module.__module__.split( - "."): + if "paddlenlp" in module.__module__.split(".") or "ppnlp_patch_utils" in module.__module__.split("."): library = "paddlenlp.transformers" else: library = module.__module__.split(".")[0] # check if the module is a pipeline module - pipeline_dir = (module.__module__.split(".")[-2] if - len(module.__module__.split(".")) > 2 else None) + pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None path = module.__module__.split(".") - is_pipeline_module = pipeline_dir in path and hasattr( - pipelines, pipeline_dir) + is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) # if library is not in LOADABLE_CLASSES, then it is a custom module. 
# Or if it's a pipeline module, then the module is inside the pipeline @@ -549,19 +534,20 @@ def register_modules(self, **kwargs): setattr(self, name, module) # TODO junnyu, before register model, we may need to keep some module in fp32 - if (isinstance(module, nn.Layer) and - hasattr(module, "_keep_in_fp32_modules") and - module.dtype == paddle.float16 and - module._keep_in_fp32_modules is not None): - for module_name, sub_module in module.named_sublayers( - include_self=True): - if any(n in module_name - for n in module._keep_in_fp32_modules): + if ( + isinstance(module, nn.Layer) + and hasattr(module, "_keep_in_fp32_modules") + and module.dtype == paddle.float16 + and module._keep_in_fp32_modules is not None + ): + for module_name, sub_module in module.named_sublayers(include_self=True): + if any(n in module_name for n in module._keep_in_fp32_modules): sub_module.to(dtype=paddle.float32) if hasattr(sub_module, "pre_hook"): sub_module.pre_hook.remove() sub_module.pre_hook = sub_module.register_forward_pre_hook( - lambda layer, input: input[0].cast("float32")) + lambda layer, input: input[0].cast("float32") + ) def __setattr__(self, name: str, value: Any): if name in self.__dict__ and hasattr(self.config, name): @@ -570,7 +556,8 @@ def __setattr__(self, name: str, value: Any): if value is not None and self.config[name][0] is not None: class_library_tuple = ( value.__module__.split(".")[0], - value.__class__.__name__, ) + value.__class__.__name__, + ) else: class_library_tuple = (None, None) @@ -581,11 +568,12 @@ def __setattr__(self, name: str, value: Any): super().__setattr__(name, value) def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: bool=None, ): + self, + save_directory: Union[str, os.PathLike], + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: bool = None, + ): """ Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading @@ -619,10 +607,7 @@ def is_saveable_module(name, value): return False return True - model_index_dict = { - k: v - for k, v in model_index_dict.items() if is_saveable_module(k, v) - } + model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} for pipeline_component_name in model_index_dict.keys(): sub_model = getattr(self, pipeline_component_name) @@ -639,8 +624,7 @@ def is_saveable_module(name, value): ) for base_class, save_load_methods in library_classes.items(): class_candidate = getattr(library, base_class, None) - if class_candidate is not None and issubclass( - model_cls, class_candidate): + if class_candidate is not None and issubclass(model_cls, class_candidate): # if we found a suitable base class in LOADABLE_CLASSES then grab its save method save_method_name = save_load_methods[0] break @@ -648,23 +632,18 @@ def is_saveable_module(name, value): break if save_method_name is None: - logger.warn( - f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved." 
- ) + logger.warn(f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved.") # make sure that unsaveable components are not tried to be loaded afterward - self.register_to_config( - **{pipeline_component_name: (None, None)}) + self.register_to_config(**{pipeline_component_name: (None, None)}) continue save_method = getattr(sub_model, save_method_name) # Call the save method with the argument safe_serialization only if it's supported save_method_signature = inspect.signature(save_method) - save_method_accept_safe = ( - "safe_serialization" in save_method_signature.parameters) + save_method_accept_safe = "safe_serialization" in save_method_signature.parameters save_method_accept_variant = "variant" in save_method_signature.parameters - save_method_accept_to_diffusers = ( - "to_diffusers" in save_method_signature.parameters) + save_method_accept_to_diffusers = "to_diffusers" in save_method_signature.parameters save_kwargs = {} # maybe we donot have torch so we use safe_serialization @@ -678,20 +657,19 @@ def is_saveable_module(name, value): if save_method_accept_to_diffusers: save_kwargs["to_diffusers"] = to_diffusers - save_method( - os.path.join(save_directory, pipeline_component_name), - **save_kwargs) + save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs) # finally save the config self.save_config(save_directory, to_diffusers=to_diffusers) def save_to_hf_hub( - self, - repo_id: str, - private: Optional[bool]=None, - commit_message: Optional[str]=None, - revision: Optional[str]=None, - create_pr: bool=False, ): + self, + repo_id: str, + private: Optional[bool] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): """ Uploads all elements of this pipeline to a new HuggingFace Hub repository. Args: @@ -715,9 +693,7 @@ def save_to_hf_hub( # Check if README file already exist in repo try: - get_hf_file_metadata( - hf_hub_url( - repo_id=repo_id, filename="README.md", revision=revision)) + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) has_readme = True except EntryNotFoundError: has_readme = False @@ -739,13 +715,15 @@ def save_to_hf_hub( folder_path=tmp_dir, commit_message=commit_message, revision=revision, - create_pr=create_pr, ) + create_pr=create_pr, + ) def to( - self, - paddle_device: Optional[str]=None, - paddle_dtype: Optional[paddle.dtype]=None, - silence_dtype_warnings: bool=True, ): + self, + paddle_device: Optional[str] = None, + paddle_dtype: Optional[paddle.dtype] = None, + silence_dtype_warnings: bool = True, + ): if paddle_device is None and paddle_dtype is None: return self @@ -753,9 +731,12 @@ def to( modules = [getattr(self, n, None) for n in module_names] modules = [m for m in modules if isinstance(m, nn.Layer)] for module in modules: - if (paddle_device is not None and module.dtype == paddle.float16 and - str(paddle_device) in ["cpu"] and - not silence_dtype_warnings): + if ( + paddle_device is not None + and module.dtype == paddle.float16 + and str(paddle_device) in ["cpu"] + and not silence_dtype_warnings + ): logger.warning( "Pipelines loaded with `paddle_dtype=paddle.float16` cannot run with `cpu` device. It" " is not recommended to move them to `cpu` as running them will fail. 
Please make" @@ -771,19 +752,20 @@ def to( module.to(**kwargs) # TODO junnyu, before register model, we may need to keep some module in fp32 - if (isinstance(module, nn.Layer) and - hasattr(module, "_keep_in_fp32_modules") and - module.dtype == paddle.float16 and - module._keep_in_fp32_modules is not None): - for module_name, sub_module in module.named_sublayers( - include_self=True): - if any(n in module_name - for n in module._keep_in_fp32_modules): + if ( + isinstance(module, nn.Layer) + and hasattr(module, "_keep_in_fp32_modules") + and module.dtype == paddle.float16 + and module._keep_in_fp32_modules is not None + ): + for module_name, sub_module in module.named_sublayers(include_self=True): + if any(n in module_name for n in module._keep_in_fp32_modules): sub_module.to(dtype=paddle.float32) if hasattr(sub_module, "pre_hook"): sub_module.pre_hook.remove() sub_module.pre_hook = sub_module.register_forward_pre_hook( - lambda layer, input: input[0].cast("float32")) + lambda layer, input: input[0].cast("float32") + ) return self @property @@ -801,10 +783,7 @@ def device(self): return "cpu" @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], - **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Instantiate a Paddle diffusion pipeline from pre-trained pipeline weights. @@ -964,18 +943,17 @@ def from_pretrained( custom_pipeline = kwargs.pop("custom_pipeline", None) custom_revision = kwargs.pop("custom_revision", None) runtime_options = kwargs.pop("runtime_options", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", - LOW_CPU_MEM_USAGE_DEFAULT) - use_safetensors = kwargs.pop("use_safetensors", None - if is_safetensors_available() else False) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) variant = kwargs.pop("variant", None) from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) # deperate return_cached_folder = kwargs.pop("return_cached_folder", False) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) load_sub_model_kwargs = { "pretrained_model_name_or_path": pretrained_model_name_or_path, @@ -1003,7 +981,8 @@ def from_pretrained( variant=variant, from_hf_hub=from_hf_hub, from_diffusers=from_diffusers, - **kwargs, ) + **kwargs, + ) else: # is_local_dir load_sub_model_kwargs["is_local_dir"] = True @@ -1023,8 +1002,8 @@ def from_pretrained( folder_path = os.path.join(cached_folder, folder) is_folder = os.path.isdir(folder_path) and folder in config_dict variant_exists = is_folder and any( - p.split(".")[1].startswith(variant) - for p in os.listdir(folder_path)) + p.split(".")[1].startswith(variant) for p in os.listdir(folder_path) + ) if variant_exists: model_variants[folder] = variant @@ -1035,18 +1014,22 @@ def from_pretrained( config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, - revision=custom_revision, ) + revision=custom_revision, + ) # DEPRECATED: To be removed in 1.0.0 - _ppdiffusers_version = (config_dict["_diffusers_paddle_version"] - if "_diffusers_paddle_version" in config_dict - else config_dict["_ppdiffusers_version"]) - if (pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and - version.parse( - 
version.parse(_ppdiffusers_version).base_version) <= - version.parse("0.5.1")): - from ppdiffusers import (StableDiffusionInpaintPipeline, - StableDiffusionInpaintPipelineLegacy) + _ppdiffusers_version = ( + config_dict["_diffusers_paddle_version"] + if "_diffusers_paddle_version" in config_dict + else config_dict["_ppdiffusers_version"] + ) + if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse( + version.parse(_ppdiffusers_version).base_version + ) <= version.parse("0.5.1"): + from ppdiffusers import ( + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, + ) pipeline_class = StableDiffusionInpaintPipelineLegacy @@ -1063,7 +1046,8 @@ def from_pretrained( "StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) # 4. Define expected modules given pipeline signature # and define non-None initialized modules (=`init_kwargs`) @@ -1071,26 +1055,15 @@ def from_pretrained( # some modules can be passed directly to the init # in this case they are already instantiated in `kwargs` # extract them here - expected_modules, optional_kwargs = cls._get_signature_keys( - pipeline_class) - passed_class_obj = { - k: kwargs.pop(k) - for k in expected_modules if k in kwargs - } - passed_pipe_kwargs = { - k: kwargs.pop(k) - for k in optional_kwargs if k in kwargs - } + expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} - init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict( - config_dict, **kwargs) + init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) # define init kwargs - init_kwargs = { - k: init_dict.pop(k) - for k in optional_kwargs if k in init_dict - } - init_kwargs = { ** init_kwargs, ** passed_pipe_kwargs} + init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict} + init_kwargs = {**init_kwargs, **passed_pipe_kwargs} # remove `null` components def load_module(name, value): @@ -1127,8 +1100,7 @@ def load_module(name, value): # 6.2 Define all importable classes is_pipeline_module = hasattr(pipelines, library_name) - importable_classes = (ALL_IMPORTABLE_CLASSES if is_pipeline_module - else LOADABLE_CLASSES[library_name]) + importable_classes = ALL_IMPORTABLE_CLASSES if is_pipeline_module else LOADABLE_CLASSES[library_name] loaded_sub_model = None # 6.3 Use passed sub model or load class_name from library_name @@ -1144,7 +1116,8 @@ def load_module(name, value): importable_classes, passed_class_obj, name, - is_pipeline_module, ) + is_pipeline_module, + ) loaded_sub_model = passed_class_obj[name] else: @@ -1164,23 +1137,20 @@ def load_module(name, value): variant=variant, low_cpu_mem_usage=low_cpu_mem_usage, cached_folder=cached_folder, - **load_sub_model_kwargs, ) + **load_sub_model_kwargs, + ) - init_kwargs[ - name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) + init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) # 7. 
Potentially add passed objects if expected missing_modules = set(expected_modules) - set(init_kwargs.keys()) passed_modules = list(passed_class_obj.keys()) optional_modules = pipeline_class._optional_components - if len(missing_modules) > 0 and missing_modules <= set( - passed_modules + optional_modules): + if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules): for module in missing_modules: init_kwargs[module] = passed_class_obj.get(module, None) elif len(missing_modules) > 0: - passed_modules = ( - set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - - optional_kwargs) + passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs raise ValueError( f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." ) @@ -1195,8 +1165,7 @@ def load_module(name, value): for _submodule in _module: if isinstance(_submodule, nn.Layer): _submodule.eval() - if (paddle_dtype is not None and - _submodule.dtype != paddle_dtype): + if paddle_dtype is not None and _submodule.dtype != paddle_dtype: _submodule.to(dtype=paddle_dtype) # 9. Instantiate the pipeline @@ -1210,8 +1179,7 @@ def load_module(name, value): return model @classmethod - def download(cls, pretrained_model_name, - **kwargs) -> Union[str, os.PathLike]: + def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: r""" Download and cache a PyTorch diffusion pipeline from pre-trained pipeline weights. Parameters: @@ -1284,8 +1252,9 @@ def download(cls, pretrained_model_name, """ from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB) - cache_dir = (kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub - else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)) + cache_dir = ( + kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE) + ) from_diffusers = kwargs.pop("from_diffusers", FROM_DIFFUSERS) resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) @@ -1299,8 +1268,7 @@ def download(cls, pretrained_model_name, use_safetensors = kwargs.pop("use_safetensors", None) max_workers = int(kwargs.pop("max_workers", 1)) - if from_diffusers and use_safetensors and not is_safetensors_available( - ): + if from_diffusers and use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetensors`" ) @@ -1324,14 +1292,14 @@ def download(cls, pretrained_model_name, use_auth_token=use_auth_token, revision=revision, from_hf_hub=from_hf_hub, - return_config_file=True, ) + return_config_file=True, + ) ignore_filenames = config_dict.pop("_ignore_files", []) # if is_fastdeploy_model, we won't use safetensors if cls == DiffusionPipeline: - is_fastdeploy_model = ( - "fastdeploy" in config_dict.get("_class_name", "").lower()) + is_fastdeploy_model = "fastdeploy" in config_dict.get("_class_name", "").lower() else: is_fastdeploy_model = "fastdeploy" in cls.__name__.lower() if is_fastdeploy_model: @@ -1354,46 +1322,38 @@ def download(cls, pretrained_model_name, info = model_info( pretrained_model_name, use_auth_token=use_auth_token, - revision=revision, ) + revision=revision, + ) filenames = {sibling.rfilename for sibling in info.siblings} - model_filenames, variant_filenames = variant_compatible_siblings( - filenames, variant=variant) + model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant) # remove ignored filenames model_filenames = set(model_filenames) - set(ignore_filenames) - variant_filenames = set(variant_filenames) - set( - ignore_filenames) + variant_filenames = set(variant_filenames) - set(ignore_filenames) # if the whole pipeline is cached we don't have to ping the Hub if revision in DEPRECATED_REVISION_ARGS and version.parse( - version.parse(__version__) - .base_version) >= version.parse("0.17.0"): + version.parse(__version__).base_version + ) >= version.parse("0.17.0"): warn_deprecated_model_variant( pretrained_model_name, use_auth_token, variant, revision, - model_filenames, ) + model_filenames, + ) - model_folder_names = { - os.path.split(f)[0] - for f in model_filenames - } + model_folder_names = {os.path.split(f)[0] for f in model_filenames} # all filenames compatible with variant will be added allow_patterns = list(model_filenames) # allow all patterns from non-model folders # this enables downloading schedulers, tokenizers, ...
- allow_patterns += [ - os.path.join(k, "*") for k in folder_names - if k not in model_folder_names - ] + allow_patterns += [os.path.join(k, "*") for k in folder_names if k not in model_folder_names] # also allow downloading config.json files with the model - allow_patterns += [ - os.path.join(k, "config.json") for k in model_folder_names - ] + allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names] allow_patterns += [ SCHEDULER_CONFIG_NAME, @@ -1408,24 +1368,28 @@ def download(cls, pretrained_model_name, config_dict, custom_pipeline=custom_pipeline, cache_dir=cache_dir, - revision=custom_revision, ) + revision=custom_revision, + ) expected_components, _ = cls._get_signature_keys(pipeline_class) - passed_components = [ - k for k in expected_components if k in kwargs - ] + passed_components = [k for k in expected_components if k in kwargs] - if (use_safetensors and not allow_pickle and - not is_safetensors_compatible( - model_filenames, - variant=variant, - passed_components=passed_components, )): + if ( + use_safetensors + and not allow_pickle + and not is_safetensors_compatible( + model_filenames, + variant=variant, + passed_components=passed_components, + ) + ): raise EnvironmentError( f"Could not find the necessary `safetensors` weights in {model_filenames} (variant={variant})" ) elif use_safetensors and is_safetensors_compatible( - model_filenames, - variant=variant, - passed_components=passed_components, ): + model_filenames, + variant=variant, + passed_components=passed_components, + ): ignore_patterns = [ "*.msgpack", "*.bin", @@ -1434,79 +1398,50 @@ def download(cls, pretrained_model_name, "*.pdmodel", ] - safetensors_variant_filenames = { - f - for f in variant_filenames if f.endswith(".safetensors") - } - safetensors_model_filenames = { - f - for f in model_filenames if f.endswith(".safetensors") - } - if (len(safetensors_variant_filenames) > 0 and - safetensors_model_filenames != - safetensors_variant_filenames): + safetensors_variant_filenames = {f for f in variant_filenames if f.endswith(".safetensors")} + safetensors_model_filenames = {f for f in model_filenames if f.endswith(".safetensors")} + if ( + len(safetensors_variant_filenames) > 0 + and safetensors_model_filenames != safetensors_variant_filenames + ): logger.warn( f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(safetensors_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(safetensors_model_filenames - safetensors_variant_filenames)}\nIf this behavior is not expected, please check your folder structure."
) else: ignore_patterns = ["*.safetensors", "*.msgpack"] if from_diffusers: - ignore_patterns.extend( - ["*.pdparams", "*.pdiparams", "*.pdmodel"]) + ignore_patterns.extend(["*.pdparams", "*.pdiparams", "*.pdmodel"]) suffix = ".bin" else: if is_fastdeploy_model: ignore_patterns.extend(["*.pdparams", "*.bin"]) suffix = ".pdmodel" else: - ignore_patterns.extend( - ["*.pdiparams", "*.pdmodel", "*.bin"]) + ignore_patterns.extend(["*.pdiparams", "*.pdmodel", "*.bin"]) suffix = ".pdparams" - bin_variant_filenames = { - f - for f in variant_filenames if f.endswith(suffix) - } - bin_model_filenames = { - f - for f in model_filenames if f.endswith(suffix) - } - if (len(bin_variant_filenames) > 0 and - bin_model_filenames != bin_variant_filenames): + bin_variant_filenames = {f for f in variant_filenames if f.endswith(suffix)} + bin_model_filenames = {f for f in model_filenames if f.endswith(suffix)} + if len(bin_variant_filenames) > 0 and bin_model_filenames != bin_variant_filenames: logger.warn( f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(bin_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(bin_model_filenames - bin_variant_filenames)}\nIf this behavior is not expected, please check your folder structure." ) # Don't download any objects that are passed allow_patterns = [ - p for p in allow_patterns - if not (len(p.split("/")) == 2 and p.split("/")[0] in - passed_components) + p for p in allow_patterns if not (len(p.split("/")) == 2 and p.split("/")[0] in passed_components) ] # Don't download index files of forbidden patterns either - ignore_patterns = ignore_patterns + [ - f"{i}.index.*json" for i in ignore_patterns - ] + ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns] - re_ignore_pattern = [ - re.compile(fnmatch.translate(p)) for p in ignore_patterns - ] - re_allow_pattern = [ - re.compile(fnmatch.translate(p)) for p in allow_patterns - ] + re_ignore_pattern = [re.compile(fnmatch.translate(p)) for p in ignore_patterns] + re_allow_pattern = [re.compile(fnmatch.translate(p)) for p in allow_patterns] - expected_files = [ - f for f in filenames - if not any(p.match(f) for p in re_ignore_pattern) - ] - expected_files = [ - f for f in expected_files - if any(p.match(f) for p in re_allow_pattern) - ] + expected_files = [f for f in filenames if not any(p.match(f) for p in re_ignore_pattern)] + expected_files = [f for f in expected_files if any(p.match(f) for p in re_allow_pattern)] snapshot_folder = Path(config_file).parent - pipeline_is_cached = all((snapshot_folder / f).is_file() - for f in expected_files) + pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files) if pipeline_is_cached: # if the pipeline is cached, we can directly return it @@ -1514,8 +1449,7 @@ def download(cls, pretrained_model_name, return snapshot_folder user_agent = {"pipeline_class": cls.__name__} - if custom_pipeline is not None and not custom_pipeline.endswith( - ".py"): + if custom_pipeline is not None and not custom_pipeline.endswith(".py"): user_agent["custom_pipeline"] = custom_pipeline # download all allow_patterns - ignore_patterns @@ -1528,13 +1462,13 @@ def download(cls, pretrained_model_name, local_files_only=local_files_only, use_auth_token=use_auth_token, revision=revision, - allow_patterns=list( - set(allow_patterns) - set(ignore_filenames)), + allow_patterns=list(set(allow_patterns) - set(ignore_filenames)), ignore_patterns=list( set(ignore_patterns + ignore_filenames) ), 
# diffusers bug, so we must add this ignore_filenames! user_agent=user_agent, - max_workers=max_workers, ) + max_workers=max_workers, + ) else: # only support [PD] .pdparams, fastdeploy model cached_folder = ppdiffusers_bos_dir_download( @@ -1547,17 +1481,16 @@ def download(cls, pretrained_model_name, variant=variant, is_fastdeploy_model=is_fastdeploy_model, local_files_only=local_files_only, - max_workers=max_workers, ) + max_workers=max_workers, + ) return cached_folder @classmethod - def from_pretrained_original_ckpt( - cls, - pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], - **kwargs): - from .stable_diffusion.convert_from_ckpt_deprecated import \ - load_pipeline_from_original_stable_diffusion_ckpt + def from_pretrained_original_ckpt(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + from .stable_diffusion.convert_from_ckpt_deprecated import ( + load_pipeline_from_original_stable_diffusion_ckpt, + ) resume_download = kwargs.pop("resume_download", False) force_download = kwargs.pop("force_download", False) @@ -1568,37 +1501,33 @@ def from_pretrained_original_ckpt( pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isfile(pretrained_model_name_or_path): checkpoint_path = pretrained_model_name_or_path - elif pretrained_model_name_or_path.startswith( - "http://") or pretrained_model_name_or_path.startswith( - "https://"): + elif pretrained_model_name_or_path.startswith("http://") or pretrained_model_name_or_path.startswith( + "https://" + ): checkpoint_path = ppdiffusers_url_download( pretrained_model_name_or_path, cache_dir=cache_dir, resume_download=resume_download, - force_download=force_download, ) + force_download=force_download, + ) else: - raise EnvironmentError( - f"Please check your {pretrained_model_name_or_path}.") + raise EnvironmentError(f"Please check your {pretrained_model_name_or_path}.") pipeline = load_pipeline_from_original_stable_diffusion_ckpt( checkpoint_path=checkpoint_path, original_config_file=original_config_file, paddle_dtype=paddle_dtype, requires_safety_checker=requires_safety_checker, cls=cls, - **kwargs, ) + **kwargs, + ) return pipeline @staticmethod def _get_signature_keys(obj): parameters = inspect.signature(obj.__init__).parameters - required_parameters = { - k: v - for k, v in parameters.items() if v.default == inspect._empty - } - optional_parameters = set( - {k - for k, v in parameters.items() if v.default != inspect._empty}) + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) expected_modules = set(required_parameters.keys()) - {"self"} return expected_modules, optional_parameters @@ -1628,9 +1557,7 @@ def components(self) -> Dict[str, Any]: """ expected_modules, optional_parameters = self._get_signature_keys(self) components = { - k: getattr(self, k) - for k in self.config.keys() - if not k.startswith("_") and k not in optional_parameters + k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters } if set(components.keys()) != expected_modules: @@ -1666,8 +1593,7 @@ def progress_bar(self, iterable=None, total=None): def set_progress_bar_config(self, **kwargs): self._progress_bar_config = kwargs - def enable_xformers_memory_efficient_attention( - self, attention_op: Optional[str]=None): + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[str] = None): r""" Enable 
memory efficient attention as implemented in xformers. @@ -1701,15 +1627,13 @@ def disable_xformers_memory_efficient_attention(self): """ self.set_use_memory_efficient_attention_xformers(False) - def set_use_memory_efficient_attention_xformers( - self, valid: bool, attention_op: Optional[str]=None) -> None: + def set_use_memory_efficient_attention_xformers(self, valid: bool, attention_op: Optional[str] = None) -> None: # Recursively walk through all the children. # Any children which exposes the set_use_memory_efficient_attention_xformers method # gets the message def fn_recursive_set_mem_eff(module: nn.Layer): if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, - attention_op) + module.set_use_memory_efficient_attention_xformers(valid, attention_op) for child in module.children(): fn_recursive_set_mem_eff(child) @@ -1721,8 +1645,7 @@ def fn_recursive_set_mem_eff(module: nn.Layer): for module in modules: fn_recursive_set_mem_eff(module) - def enable_attention_slicing(self, - slice_size: Optional[Union[str, int]]="auto"): + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): r""" Enable sliced attention computation. @@ -1749,10 +1672,7 @@ def disable_attention_slicing(self): def set_attention_slice(self, slice_size: Optional[int]): module_names, _ = self._get_signature_keys(self) modules = [getattr(self, n, None) for n in module_names] - modules = [ - m for m in modules - if isinstance(m, nn.Layer) and hasattr(m, "set_attention_slice") - ] + modules = [m for m in modules if isinstance(m, nn.Layer) and hasattr(m, "set_attention_slice")] for module in modules: module.set_attention_slice(slice_size) diff --git a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py b/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py index 975204896be93..c946ea77ac787 100644 --- a/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py +++ b/ppdiffusers/ppdiffusers/pipelines/pndm/pipeline_pndm.py @@ -46,14 +46,14 @@ def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=50, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + num_inference_steps: int = 50, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, `optional`, defaults to 1): The number of images to generate. 
@@ -80,8 +80,10 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) self.scheduler.set_timesteps(num_inference_steps) for t in self.progress_bar(self.scheduler.timesteps): @@ -95,6 +97,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py b/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py index a44fac86017af..b0d248fac49cc 100644 --- a/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py +++ b/ppdiffusers/ppdiffusers/pipelines/repaint/pipeline_repaint.py @@ -38,11 +38,7 @@ def _preprocess_image(image: Union[List, PIL.Image.Image, paddle.Tensor]): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -62,12 +58,7 @@ def _preprocess_mask(mask: Union[List, PIL.Image.Image, paddle.Tensor]): if isinstance(mask[0], PIL.Image.Image): w, h = mask[0].size w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 - mask = [ - np.array( - m.convert("L").resize( - (w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] - for m in mask - ] + mask = [np.array(m.convert("L").resize((w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] for m in mask] mask = np.concatenate(mask, axis=0) mask = mask.astype(np.float32) / 255.0 mask[mask < 0.5] = 0 @@ -88,17 +79,17 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - num_inference_steps: int=250, - eta: float=0.0, - jump_length: int=10, - jump_n_sample: int=10, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, ) -> Union[ImagePipelineOutput, Tuple]: + self, + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + num_inference_steps: int = 250, + eta: float = 0.0, + jump_length: int = 10, + jump_n_sample: int = 10, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: image (`paddle.Tensor` or `PIL.Image.Image`): @@ -146,12 +137,10 @@ def __call__( ) image_shape = original_image.shape - image = randn_tensor( - image_shape, generator=generator, dtype=self.unet.dtype) + image = randn_tensor(image_shape, generator=generator, dtype=self.unet.dtype) # set step values - self.scheduler.set_timesteps(num_inference_steps, jump_length, - jump_n_sample) + self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample) self.scheduler.eta = eta t_last = self.scheduler.timesteps[0] + 1 @@ -161,9 +150,7 @@ def __call__( # predict the noise residual model_output = self.unet(image, t).sample # compute previous image: x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image, - original_image, mask_image, - 
generator).prev_sample + image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample else: # compute the reverse: x_t-1 -> x_t @@ -176,6 +163,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py index e3ce24a7eaf72..4e81855ba00f1 100644 --- a/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +++ b/ppdiffusers/ppdiffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py @@ -40,14 +40,14 @@ def __init__(self, unet: UNet2DModel, scheduler: DiffusionPipeline): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=2000, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + num_inference_steps: int = 2000, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -70,25 +70,22 @@ def __call__( model = self.unet - sample = (randn_tensor( - shape, generator=generator) * self.scheduler.init_noise_sigma) + sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma self.scheduler.set_timesteps(num_inference_steps) self.scheduler.set_sigmas(num_inference_steps) for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): - sigma_t = self.scheduler.sigmas[i] * paddle.ones((shape[0], )) + sigma_t = self.scheduler.sigmas[i] * paddle.ones((shape[0],)) # correction step for _ in range(self.scheduler.config.correct_steps): model_output = self.unet(sample, sigma_t).sample - sample = self.scheduler.step_correct( - model_output, sample, generator=generator).prev_sample + sample = self.scheduler.step_correct(model_output, sample, generator=generator).prev_sample # prediction step model_output = model(sample, sigma_t).sample - output = self.scheduler.step_pred( - model_output, t, sample, generator=generator) + output = self.scheduler.step_pred(model_output, t, sample, generator=generator) sample, sample_mean = output.prev_sample, output.prev_sample_mean @@ -98,6 +95,6 @@ def __call__( sample = self.numpy_to_pil(sample) if not return_dict: - return (sample, ) + return (sample,) return ImagePipelineOutput(images=sample) diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py index e24cb5eee2eb1..9842e59ad078e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/__init__.py @@ -42,5 +42,4 @@ class SemanticStableDiffusionPipelineOutput(BaseOutput): if is_paddle_available() and is_paddlenlp_available(): - from .pipeline_semantic_stable_diffusion import \ - SemanticStableDiffusionPipeline + from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py index b8778c74b1d86..7fd2b4f407754 
100644 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py +++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/custom_quantile.py @@ -68,8 +68,7 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): if isinstance(axis, list): axis_src, axis_dst = [], [] for axis_single in axis: - if not isinstance(axis_single, int) or not ( - axis_single < dims and axis_single >= -dims): + if not isinstance(axis_single, int) or not (axis_single < dims and axis_single >= -dims): raise ValueError( "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." ) @@ -88,17 +87,13 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): axis = axis_dst[0] else: if not isinstance(axis, int) or not (axis < dims and axis >= -dims): - raise ValueError( - "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." - ) + raise ValueError("Axis should be None, int, or a list, element should in range [-rank(x), rank(x)).") if axis < 0: axis += dims out_shape[axis] = 1 mask = x.isnan() - valid_counts = mask.logical_not().sum(axis=axis, - keepdim=True, - dtype="float64") + valid_counts = mask.logical_not().sum(axis=axis, keepdim=True, dtype="float64") indices = [] @@ -127,15 +122,14 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): for index in indices: indices_below = paddle.floor(index).astype(paddle.int32) indices_upper = paddle.ceil(index).astype(paddle.int32) - tensor_upper = paddle.take_along_axis( - sorted_tensor, indices_upper, axis=axis) - tensor_below = paddle.take_along_axis( - sorted_tensor, indices_below, axis=axis) + tensor_upper = paddle.take_along_axis(sorted_tensor, indices_upper, axis=axis) + tensor_below = paddle.take_along_axis(sorted_tensor, indices_below, axis=axis) weights = index - indices_below.astype("float64") out = paddle.lerp( tensor_below.astype("float64"), tensor_upper.astype("float64"), - weights, ) + weights, + ) if not keepdim: out = paddle.squeeze(out, axis=axis) else: diff --git a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py index df0e298fe252a..70eaa17e88188 100644 --- a/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -18,13 +18,11 @@ from typing import Callable, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...models import AutoencoderKL, UNet2DConditionModel from ...pipeline_utils import DiffusionPipeline -from ...pipelines.stable_diffusion.safety_checker import \ - StableDiffusionSafetyChecker +from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, randn_tensor from . 
import SemanticStableDiffusionPipelineOutput @@ -107,15 +105,16 @@ class SemanticStableDiffusionPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -141,8 +140,9 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents @@ -161,54 +161,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -221,23 +217,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -253,33 +252,33 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: int=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - editing_prompt: Optional[Union[str, List[str]]]=None, - editing_prompt_embeddings: Optional[paddle.Tensor]=None, - reverse_editing_direction: Optional[Union[bool, List[bool]]]=False, - edit_guidance_scale: Optional[Union[float, List[float]]]=5, - edit_warmup_steps: Optional[Union[int, List[int]]]=10, - edit_cooldown_steps: Optional[Union[int, List[int]]]=None, - edit_threshold: Optional[Union[float, List[float]]]=0.9, - edit_momentum_scale: Optional[float]=0.1, - edit_mom_beta: Optional[float]=0.4, - edit_weights: Optional[List[float]]=None, - sem_guidance: Optional[List[paddle.Tensor]]=None, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + editing_prompt: Optional[Union[str, List[str]]] = None, + editing_prompt_embeddings: Optional[paddle.Tensor] = None, + reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, + edit_guidance_scale: 
Optional[Union[float, List[float]]] = 5, + edit_warmup_steps: Optional[Union[int, List[int]]] = 10, + edit_cooldown_steps: Optional[Union[int, List[int]]] = None, + edit_threshold: Optional[Union[float, List[float]]] = 0.9, + edit_momentum_scale: Optional[float] = 0.1, + edit_mom_beta: Optional[float] = 0.4, + edit_weights: Optional[List[float]] = None, + sem_guidance: Optional[List[paddle.Tensor]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -399,61 +398,53 @@ def __call__( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] text_embeddings = self.text_encoder(text_input_ids)[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if enable_edit_guidance: # get safety text embeddings if editing_prompt_embeddings is None: edit_concepts_input = self.tokenizer( - [ - x - for item in editing_prompt - for x in repeat(item, batch_size) - ], + [x for item in editing_prompt for x in repeat(item, batch_size)], padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) edit_concepts_input_ids = edit_concepts_input.input_ids - if edit_concepts_input_ids.shape[ - -1] > self.tokenizer.model_max_length: + if edit_concepts_input_ids.shape[-1] > self.tokenizer.model_max_length: removed_text = self.tokenizer.batch_decode( - edit_concepts_input_ids[:, self.tokenizer. - model_max_length:]) + edit_concepts_input_ids[:, self.tokenizer.model_max_length :] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) - edit_concepts_input_ids = edit_concepts_input_ids[:, :self. - tokenizer. 
- model_max_length] + edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length] edit_concepts = self.text_encoder(edit_concepts_input_ids)[0] else: - edit_concepts = editing_prompt_embeddings.tile( - [batch_size, 1, 1]) + edit_concepts = editing_prompt_embeddings.tile([batch_size, 1, 1]) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed_edit, seq_len_edit, _ = edit_concepts.shape edit_concepts = edit_concepts.tile([1, num_images_per_prompt, 1]) - edit_concepts = edit_concepts.reshape( - [bs_embed_edit * num_images_per_prompt, seq_len_edit, -1]) + edit_concepts = edit_concepts.reshape([bs_embed_edit * num_images_per_prompt, seq_len_edit, -1]) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -468,14 +459,16 @@ def __call__( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -485,25 +478,22 @@ def __call__( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_embeddings = self.text_encoder(uncond_input.input_ids)[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + uncond_embeddings = uncond_embeddings.tile([batch_size, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes if enable_edit_guidance: - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings, edit_concepts]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings, edit_concepts]) else: - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) # get the initial random noise unless the user supplied it # 4. Prepare timesteps @@ -519,7 +509,8 @@ def __call__( width, text_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -534,41 +525,39 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * - (2 + enabled_editing_prompts)) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([latents] * (2 + enabled_editing_prompts)) if do_classifier_free_guidance else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: - noise_pred_out = noise_pred.chunk( - 2 + enabled_editing_prompts) # [b,4, 64, 64] + noise_pred_out = noise_pred.chunk(2 + enabled_editing_prompts) # [b,4, 64, 64] noise_pred_uncond, noise_pred_text = ( noise_pred_out[0], - noise_pred_out[1], ) + noise_pred_out[1], + ) noise_pred_edit_concepts = noise_pred_out[2:] # default text guidance - noise_guidance = guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_guidance = guidance_scale * (noise_pred_text - noise_pred_uncond) # noise_guidance = (noise_pred_text - noise_pred_edit_concepts[0]) if self.uncond_estimates is None: self.uncond_estimates = paddle.zeros( (num_inference_steps + 1, *noise_pred_uncond.shape), - dtype=noise_pred.dtype, ) + dtype=noise_pred.dtype, + ) self.uncond_estimates[i] = noise_pred_uncond.detach() if self.text_estimates is None: self.text_estimates = paddle.zeros( (num_inference_steps + 1, *noise_pred_text.shape), - dtype=noise_pred.dtype, ) + dtype=noise_pred.dtype, + ) self.text_estimates[i] = noise_pred_text.detach() if self.edit_estimates is None and enable_edit_guidance: @@ -576,29 +565,32 @@ def __call__( ( num_inference_steps + 1, len(noise_pred_edit_concepts), - *noise_pred_edit_concepts[0].shape, ), - dtype=noise_pred.dtype, ) + *noise_pred_edit_concepts[0].shape, + ), + dtype=noise_pred.dtype, + ) if self.sem_guidance is None: self.sem_guidance = paddle.zeros( (num_inference_steps + 1, *noise_pred_text.shape), - dtype=noise_pred.dtype, ) + dtype=noise_pred.dtype, + ) if edit_momentum is None: edit_momentum = paddle.zeros_like(noise_guidance) if enable_edit_guidance: concept_weights = paddle.zeros( - (len(noise_pred_edit_concepts), - noise_guidance.shape[0]), - dtype=noise_guidance.dtype, ) + (len(noise_pred_edit_concepts), noise_guidance.shape[0]), + dtype=noise_guidance.dtype, + ) noise_guidance_edit = paddle.zeros( (len(noise_pred_edit_concepts), *noise_guidance.shape), - dtype=noise_guidance.dtype, ) + dtype=noise_guidance.dtype, + ) # noise_guidance_edit = torch.zeros_like(noise_guidance) warmup_inds = [] - for c, noise_pred_edit_concept in enumerate( - noise_pred_edit_concepts): + for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts): self.edit_estimates[i, c] = noise_pred_edit_concept if isinstance(edit_guidance_scale, list): edit_guidance_scale_c = edit_guidance_scale[c] @@ -610,8 +602,7 @@ def __call__( else: edit_threshold_c = edit_threshold if isinstance(reverse_editing_direction, list): - reverse_editing_direction_c = reverse_editing_direction[ - c] + reverse_editing_direction_c = reverse_editing_direction[c] else: reverse_editing_direction_c = reverse_editing_direction 
if edit_weights: @@ -632,27 +623,19 @@ def __call__( if i >= edit_warmup_steps_c: warmup_inds.append(c) if i >= edit_cooldown_steps_c: - noise_guidance_edit[ - c, :, :, :, :] = paddle.zeros_like( - noise_pred_edit_concept) + noise_guidance_edit[c, :, :, :, :] = paddle.zeros_like(noise_pred_edit_concept) continue - noise_guidance_edit_tmp = ( - noise_pred_edit_concept - noise_pred_uncond) + noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond # tmp_weights = (noise_pred_text - noise_pred_edit_concept).sum(dim=(1, 2, 3)) - tmp_weights = ( - noise_guidance - noise_pred_edit_concept).sum( - (1, 2, 3)) + tmp_weights = (noise_guidance - noise_pred_edit_concept).sum((1, 2, 3)) - tmp_weights = paddle.full_like( - tmp_weights, - edit_weight_c) # * (1 / enabled_editing_prompts) + tmp_weights = paddle.full_like(tmp_weights, edit_weight_c) # * (1 / enabled_editing_prompts) if reverse_editing_direction_c: noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1 concept_weights[c, :] = tmp_weights - noise_guidance_edit_tmp = (noise_guidance_edit_tmp * - edit_guidance_scale_c) + noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c # quantile function expects float32 if noise_guidance_edit_tmp.dtype == paddle.float32: @@ -660,23 +643,22 @@ def __call__( paddle.abs(noise_guidance_edit_tmp).flatten(2), edit_threshold_c, axis=2, - keepdim=False, ) + keepdim=False, + ) else: tmp = quantile( - paddle.abs(noise_guidance_edit_tmp).flatten(2) - .cast(paddle.float32), + paddle.abs(noise_guidance_edit_tmp).flatten(2).cast(paddle.float32), edit_threshold_c, axis=2, keepdim=False, ).cast(noise_guidance_edit_tmp.dtype) noise_guidance_edit_tmp = paddle.where( - paddle.abs(noise_guidance_edit_tmp) >= - tmp[:, :, None, None], + paddle.abs(noise_guidance_edit_tmp) >= tmp[:, :, None, None], noise_guidance_edit_tmp, - paddle.zeros_like(noise_guidance_edit_tmp), ) - noise_guidance_edit[ - c, :, :, :, :] = noise_guidance_edit_tmp + paddle.zeros_like(noise_guidance_edit_tmp), + ) + noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp @@ -685,22 +667,21 @@ def __call__( # concept_weights = concept_weights.to("cpu") # Offload to cpu # noise_guidance_edit = noise_guidance_edit.to("cpu") - concept_weights_tmp = paddle.index_select( - concept_weights, warmup_inds, 0) + concept_weights_tmp = paddle.index_select(concept_weights, warmup_inds, 0) concept_weights_tmp = paddle.where( concept_weights_tmp < 0, paddle.zeros_like(concept_weights_tmp), - concept_weights_tmp, ) - concept_weights_tmp = (concept_weights_tmp / - concept_weights_tmp.sum(0)) + concept_weights_tmp, + ) + concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(0) # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp) - noise_guidance_edit_tmp = paddle.index_select( - noise_guidance_edit, warmup_inds, 0) + noise_guidance_edit_tmp = paddle.index_select(noise_guidance_edit, warmup_inds, 0) noise_guidance_edit_tmp = paddle.einsum( "cb,cbijk->bijk", concept_weights_tmp, - noise_guidance_edit_tmp, ) + noise_guidance_edit_tmp, + ) noise_guidance_edit_tmp = noise_guidance_edit_tmp noise_guidance = noise_guidance + noise_guidance_edit_tmp @@ -714,17 +695,15 @@ def __call__( concept_weights = paddle.where( concept_weights < 0, paddle.zeros_like(concept_weights), - concept_weights, ) + concept_weights, + ) # concept_weights = paddle.nan_to_num(concept_weights) - noise_guidance_edit = paddle.einsum( - "cb,cbijk->bijk", concept_weights, 
noise_guidance_edit) + noise_guidance_edit = paddle.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit) - noise_guidance_edit = (noise_guidance_edit + - edit_momentum_scale * edit_momentum) + noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum - edit_momentum = (edit_mom_beta * edit_momentum + - (1 - edit_mom_beta) * noise_guidance_edit) + edit_momentum = edit_mom_beta * edit_momentum + (1 - edit_mom_beta) * noise_guidance_edit if warmup_inds.shape[0] == len(noise_pred_edit_concepts): noise_guidance = noise_guidance + noise_guidance_edit @@ -737,8 +716,7 @@ def __call__( noise_pred = noise_pred_uncond + noise_guidance # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -748,12 +726,11 @@ def __call__( image = self.decode_latents(latents) if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=safety_checker_input.pixel_values.cast( - text_embeddings.dtype), ) + clip_input=safety_checker_input.pixel_values.cast(text_embeddings.dtype), + ) else: has_nsfw_concept = None @@ -763,5 +740,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return SemanticStableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py index 44d2a3ed3c947..53dd30da98557 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/__init__.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...utils import (OptionalDependencyNotAvailable, is_note_seq_available, - is_paddle_available, is_paddlenlp_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_note_seq_available, + is_paddle_available, + is_paddlenlp_available, +) try: if not (is_paddlenlp_available() and is_paddle_available()): @@ -23,10 +27,12 @@ else: from .notes_encoder import SpectrogramNotesEncoder from .pipeline_spectrogram_diffusion import ( - SpectrogramContEncoder, SpectrogramDiffusionPipeline, T5FilmDecoder) + SpectrogramContEncoder, + SpectrogramDiffusionPipeline, + T5FilmDecoder, + ) try: - if not (is_paddlenlp_available() and is_paddle_available() and - is_note_seq_available()): + if not (is_paddlenlp_available() and is_paddle_available() and is_note_seq_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_and_note_seq_objects import * diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py index 4378ce01e5784..d09306582dc21 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/continous_encoder.py @@ -17,28 +17,27 @@ from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm -from ...configuration_utils import (ConfigMixin, ModuleUtilsMixin, - register_to_config) +from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config from ...models import ModelMixin class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( - self, - input_dims: int, - targets_context_length: int, - d_model: int, - dropout_rate: float, - num_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str, - is_decoder: bool=False, ): + self, + input_dims: int, + targets_context_length: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + is_decoder: bool = False, + ): super().__init__() - self.input_proj = nn.Linear( - in_features=input_dims, out_features=d_model, bias_attr=False) + self.input_proj = nn.Linear(in_features=input_dims, out_features=d_model, bias_attr=False) self.position_encoding = nn.Embedding(targets_context_length, d_model) self.position_encoding.weight.stop_gradient = True self.dropout_pre = nn.Dropout(p=dropout_rate) @@ -50,7 +49,8 @@ def __init__( feed_forward_proj=feed_forward_proj, dropout_rate=dropout_rate, is_decoder=is_decoder, - is_encoder_decoder=False, ) + is_encoder_decoder=False, + ) self.encoders = nn.LayerList() for lyr_num in range(num_layers): lyr = T5Block(t5config) @@ -66,17 +66,13 @@ def forward(self, encoder_inputs, encoder_inputs_mask): input_positions = paddle.arange(end=max_positions) seq_lens = encoder_inputs_mask.sum(axis=-1) - input_positions = paddle.roll( - x=input_positions.unsqueeze(axis=0), - shifts=tuple(seq_lens.tolist()), - axis=0) + input_positions = paddle.roll(x=input_positions.unsqueeze(axis=0), shifts=tuple(seq_lens.tolist()), axis=0) x += self.position_encoding(input_positions) x = self.dropout_pre(x) # inverted the attention mask input_shape = encoder_inputs.shape - extended_attention_mask = self.get_extended_attention_mask( - encoder_inputs_mask, input_shape) + extended_attention_mask = 
self.get_extended_attention_mask(encoder_inputs_mask, input_shape) for lyr in self.encoders: x = lyr(x, extended_attention_mask)[0] diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py index d8dcc8a98cf87..3997ce07f5845 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/midi_utils.py @@ -15,8 +15,17 @@ import dataclasses import math import os -from typing import (Any, Callable, List, Mapping, MutableMapping, Optional, - Sequence, Tuple, Union) +from typing import ( + Any, + Callable, + List, + Mapping, + MutableMapping, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np import paddle @@ -96,8 +105,7 @@ class NoteEncodingState: """Encoding state for note transcription, keeping track of active pitches.""" # velocity bin for active pitches and programs - active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field( - default_factory=dict) + active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict) @dataclasses.dataclass @@ -149,10 +157,11 @@ class Codec: """ def __init__( - self, - max_shift_steps: int, - steps_per_second: float, - event_ranges: List[EventRange], ): + self, + max_shift_steps: int, + steps_per_second: float, + event_ranges: List[EventRange], + ): """Define Codec. Args: @@ -162,14 +171,11 @@ def __init__( event_ranges: Other supported event types and their ranges. """ self.steps_per_second = steps_per_second - self._shift_range = EventRange( - type="shift", min_value=0, max_value=max_shift_steps) + self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps) self._event_ranges = [self._shift_range] + event_ranges # Ensure all event types have unique names. - assert len(self._event_ranges) == len( - {er.type - for er in self._event_ranges}) + assert len(self._event_ranges) == len({er.type for er in self._event_ranges}) @property def num_classes(self) -> int: @@ -179,8 +185,7 @@ def num_classes(self) -> int: # events that are intended to be used from within autograph functions. 
def is_shift_event_index(self, index: int) -> bool: - return (self._shift_range.min_value <= index and - index <= self._shift_range.max_value) + return self._shift_range.min_value <= index and index <= self._shift_range.max_value @property def max_shift_steps(self) -> int: @@ -235,31 +240,29 @@ def programs_to_midi_classes(tokens, codec): """Modifies program events to be the first program in the MIDI class.""" min_program_id, max_program_id = codec.event_type_range("program") is_program = (tokens >= min_program_id) & (tokens <= max_program_id) - return np.where(is_program, min_program_id + 8 * ( - (tokens - min_program_id) // 8), tokens) + return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) PROGRAM_GRANULARITIES = { # "flat" granularity; drop program change tokens and set NoteSequence # programs to zero - "flat": ProgramGranularity( - tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), + "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), # map each program to the first program in its MIDI class "midi_class": ProgramGranularity( tokens_map_fn=programs_to_midi_classes, - program_map_fn=lambda program: 8 * (program // 8), ), + program_map_fn=lambda program: 8 * (program // 8), + ), # leave programs as is "full": ProgramGranularity( tokens_map_fn=lambda tokens, codec: tokens, - program_map_fn=lambda program: program, ), + program_map_fn=lambda program: program, + ), } def unfold(tensor, dimension, size, step=1): - assert dimension < len( - tensor.shape), "dimension must be less than tensor dimensions" - assert (tensor.shape[dimension] >= size - ), "size should not be greater than the dimension of tensor" + assert dimension < len(tensor.shape), "dimension must be less than tensor dimensions" + assert tensor.shape[dimension] >= size, "size should not be greater than the dimension of tensor" slices = [] for i in range(0, tensor.shape[dimension] - size + 1, step): @@ -276,24 +279,19 @@ def unfold(tensor, dimension, size, step=1): return unfolded_tensor -def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, - axis=-1): +def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): """ equivalent of tf.signal.frame """ signal_length = signal.shape[axis] if pad_end: frames_overlap = frame_length - frame_step - rest_samples = np.abs(signal_length - frames_overlap) % np.abs( - frame_length - frames_overlap) + rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) pad_size = int(frame_length - rest_samples) if pad_size != 0: pad_axis = [0] * signal.ndim pad_axis[axis] = pad_size - signal = F.pad(x=signal, - pad=pad_axis, - mode="constant", - value=pad_value) + signal = F.pad(x=signal, pad=pad_axis, mode="constant", value=pad_value) frames = unfold(signal, axis, frame_length, frame_step) return frames @@ -305,28 +303,26 @@ def program_to_slakh_program(program): return slakh_program -def audio_to_frames( - samples, hop_size: int, - frame_rate: int) -> Tuple[Sequence[Sequence[int]], paddle.Tensor]: +def audio_to_frames(samples, hop_size: int, frame_rate: int) -> Tuple[Sequence[Sequence[int]], paddle.Tensor]: """Convert audio samples to non-overlapping frames and frame times.""" frame_size = hop_size - samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], - mode="constant") + samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant") # Split audio into frames. 
frames = frame( paddle.to_tensor(data=samples).unsqueeze(axis=0), frame_length=frame_size, frame_step=frame_size, - pad_end=False, ) + pad_end=False, + ) num_frames = len(samples) // frame_size times = np.arange(num_frames) / frame_rate return frames, times def note_sequence_to_onsets_and_offsets_and_programs( - ns: note_seq.NoteSequence, ) -> Tuple[Sequence[float], Sequence[ - NoteEventData]]: + ns: note_seq.NoteSequence, +) -> Tuple[Sequence[float], Sequence[NoteEventData]]: """Extract onset & offset times and pitches & programs from a NoteSequence. The onset & offset times will not necessarily be in sorted order. @@ -341,21 +337,20 @@ def note_sequence_to_onsets_and_offsets_and_programs( """ # Sort by program and pitch and put offsets before onsets as a tiebreaker for # subsequent stable sort. - notes = sorted( - ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) - times = [note.end_time for note in notes if not note.is_drum] + [ - note.start_time for note in notes - ] + notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) + times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] values = [ - NoteEventData( - pitch=note.pitch, velocity=0, program=note.program, is_drum=False) - for note in notes if not note.is_drum + NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) + for note in notes + if not note.is_drum ] + [ NoteEventData( pitch=note.pitch, velocity=note.velocity, program=note.program, - is_drum=note.is_drum, ) for note in notes + is_drum=note.is_drum, + ) + for note in notes ] return times, values @@ -368,20 +363,19 @@ def num_velocity_bins_from_codec(codec: Codec): # segment an array into segments of length n def segment(a, n): - return [a[i:i + n] for i in range(0, len(a), n)] + return [a[i : i + n] for i in range(0, len(a), n)] def velocity_to_bin(velocity, num_velocity_bins): if velocity == 0: return 0 else: - return math.ceil(num_velocity_bins * velocity / - note_seq.MAX_MIDI_VELOCITY) + return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY) -def note_event_data_to_events(state: Optional[NoteEncodingState], - value: NoteEventData, - codec: Codec) -> Sequence[Event]: +def note_event_data_to_events( + state: Optional[NoteEncodingState], value: NoteEventData, codec: Codec +) -> Sequence[Event]: """Convert note event data to a sequence of events.""" if value.velocity is None: # onsets only, no program or velocity @@ -393,9 +387,7 @@ def note_event_data_to_events(state: Optional[NoteEncodingState], # onsets + offsets + velocities only, no programs if state is not None: state.active_pitches[value.pitch, 0] = velocity_bin - return [ - Event("velocity", velocity_bin), Event("pitch", value.pitch) - ] + return [Event("velocity", velocity_bin), Event("pitch", value.pitch)] elif value.is_drum: # drum events use a separate vocabulary return [Event("velocity", velocity_bin), Event("drum", value.pitch)] @@ -413,8 +405,7 @@ def note_event_data_to_events(state: Optional[NoteEncodingState], def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: """Output program and pitch events for active notes plus a final tie event.""" events = [] - for pitch, program in sorted( - state.active_pitches.keys(), key=lambda k: k[::-1]): + for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]): if state.active_pitches[pitch, program]: events += [Event("program", program), Event("pitch", pitch)] events.append(Event("tie", 
0)) @@ -422,13 +413,14 @@ def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: def encode_and_index_events( - state, - event_times, - event_values, - codec, - frame_times, - encode_event_fn, - encoding_state_to_events_fn=None, ): + state, + event_times, + event_values, + codec, + frame_times, + encode_event_fn, + encoding_state_to_events_fn=None, +): """Encode a sequence of timed events and index to audio frame times. Encodes time shifts as repeated single step shifts for later run length encoding. @@ -460,9 +452,7 @@ def encode_and_index_events( state_event_indices: Corresponding state event index for every audio frame. """ indices = np.argsort(event_times, kind="stable") - event_steps = [ - round(event_times[i] * codec.steps_per_second) for i in indices - ] + event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices] event_values = [event_values[i] for i in indices] events = [] state_events = [] @@ -473,9 +463,10 @@ def encode_and_index_events( cur_state_event_idx = 0 def fill_event_start_indices_to_cur_step(): - while (len(event_start_indices) < len(frame_times) and - frame_times[len(event_start_indices)] < cur_step / - codec.steps_per_second): + while ( + len(event_start_indices) < len(frame_times) + and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second + ): event_start_indices.append(cur_event_idx) state_event_indices.append(cur_state_event_idx) @@ -511,28 +502,24 @@ def fill_event_start_indices_to_cur_step(): events = np.array(events).astype(np.int32) state_events = np.array(state_events).astype(np.int32) - event_start_indices = segment( - np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) - event_end_indices = segment( - np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) - state_event_indices = segment( - np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) outputs = [] - for start_indices, end_indices, event_indices in zip( - event_start_indices, event_end_indices, state_event_indices): - outputs.append({ - "inputs": events, - "event_start_indices": start_indices, - "event_end_indices": end_indices, - "state_events": state_events, - "state_event_indices": event_indices, - }) + for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): + outputs.append( + { + "inputs": events, + "event_start_indices": start_indices, + "event_end_indices": end_indices, + "state_events": state_events, + "state_event_indices": event_indices, + } + ) return outputs -def extract_sequence_with_indices(features, - state_events_end_token=None, - feature_key="inputs"): +def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"): """Extract target sequence corresponding to audio token segment.""" features = features.copy() start_idx = features["event_start_indices"][0] @@ -543,36 +530,33 @@ def extract_sequence_with_indices(features, # prepend them to the targets array. 
state_event_start_idx = features["state_event_indices"][0] state_event_end_idx = state_event_start_idx + 1 - while (features["state_events"][state_event_end_idx - 1] != - state_events_end_token): + while features["state_events"][state_event_end_idx - 1] != state_events_end_token: state_event_end_idx += 1 features[feature_key] = np.concatenate( [ - features["state_events"][state_event_start_idx: - state_event_end_idx], + features["state_events"][state_event_start_idx:state_event_end_idx], features[feature_key], ], - axis=0, ) + axis=0, + ) return features -def map_midi_programs(feature, - codec: Codec, - granularity_type: str="full", - feature_key: str="inputs") -> Mapping[str, Any]: +def map_midi_programs( + feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs" +) -> Mapping[str, Any]: """Apply MIDI program map to token sequences.""" granularity = PROGRAM_GRANULARITIES[granularity_type] - feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], - codec) + feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec) return feature def run_length_encode_shifts_fn( - features, - codec: Codec, - feature_key: str="inputs", - state_change_event_types: Sequence[str]=( - ), ) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: + features, + codec: Codec, + feature_key: str = "inputs", + state_change_event_types: Sequence[str] = (), +) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: """Return a function that run-length encodes shifts for a given codec. Args: @@ -585,13 +569,9 @@ def run_length_encode_shifts_fn( Returns: A preprocessing function that run-length encodes single-step shifts. """ - state_change_event_ranges = [ - codec.event_type_range(event_type) - for event_type in state_change_event_types - ] + state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types] - def run_length_encode_shifts( - features: MutableMapping[str, Any]) -> Mapping[str, Any]: + def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]: """Combine leading/interior shifts, trim trailing shifts. Args: @@ -613,8 +593,7 @@ def run_length_encode_shifts( # If this event is a state change and has the same value as the current # state, we can skip it entirely. 
is_redundant = False - for i, (min_index, - max_index) in enumerate(state_change_event_ranges): + for i, (min_index, max_index) in enumerate(state_change_event_ranges): if min_index <= event and event <= max_index: if current_state[i] == event: is_redundant = True @@ -627,10 +606,8 @@ def run_length_encode_shifts( if shift_steps > 0: shift_steps = total_shift_steps while shift_steps > 0: - output_steps = np.minimum(codec.max_shift_steps, - shift_steps) - output = np.concatenate( - [output, [output_steps]], axis=0) + output_steps = np.minimum(codec.max_shift_steps, shift_steps) + output = np.concatenate([output, [output_steps]], axis=0) shift_steps -= output_steps output = np.concatenate([output, [event]], axis=0) features[feature_key] = output @@ -639,42 +616,32 @@ def run_length_encode_shifts( return run_length_encode_shifts(features) -def note_representation_processor_chain( - features, - codec: Codec, - note_representation_config: NoteRepresentationConfig): +def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig): tie_token = codec.encode_event(Event("tie", 0)) - state_events_end_token = (tie_token if - note_representation_config.include_ties else None) + state_events_end_token = tie_token if note_representation_config.include_ties else None features = extract_sequence_with_indices( - features, - state_events_end_token=state_events_end_token, - feature_key="inputs") + features, state_events_end_token=state_events_end_token, feature_key="inputs" + ) features = map_midi_programs(features, codec) - features = run_length_encode_shifts_fn( - features, codec, state_change_event_types=["velocity", "program"]) + features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"]) return features class MidiProcessor: def __init__(self): self.codec = Codec( - max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * - DEFAULT_STEPS_PER_SECOND, + max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND, steps_per_second=DEFAULT_STEPS_PER_SECOND, event_ranges=[ - EventRange("pitch", note_seq.MIN_MIDI_PITCH, - note_seq.MAX_MIDI_PITCH), + EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS), EventRange("tie", 0, 0), - EventRange("program", note_seq.MIN_MIDI_PROGRAM, - note_seq.MAX_MIDI_PROGRAM), - EventRange("drum", note_seq.MIN_MIDI_PITCH, - note_seq.MAX_MIDI_PITCH), - ], ) + EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM), + EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + ], + ) self.tokenizer = Tokenizer(self.codec.num_classes) - self.note_representation_config = NoteRepresentationConfig( - onsets_only=False, include_ties=True) + self.note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True) def __call__(self, midi: Union[bytes, os.PathLike, str]): if not isinstance(midi, bytes): @@ -695,13 +662,10 @@ def __call__(self, midi: Union[bytes, os.PathLike, str]): frame_times=frame_times, codec=self.codec, encode_event_fn=note_event_data_to_events, - encoding_state_to_events_fn=note_encoding_state_to_events, ) + encoding_state_to_events_fn=note_encoding_state_to_events, + ) events = [ - note_representation_processor_chain(event, self.codec, - self.note_representation_config) - for event in events - ] - input_tokens = [ - self.tokenizer.encode(event["inputs"]) for event in events + note_representation_processor_chain(event, self.codec, 
self.note_representation_config) for event in events ] + input_tokens = [self.tokenizer.encode(event["inputs"]) for event in events] return input_tokens diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py index 73d0d48ee3f28..bcf4c659a6e5f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/notes_encoder.py @@ -17,25 +17,25 @@ from paddlenlp.transformers.t5.configuration import T5Config from paddlenlp.transformers.t5.modeling import T5Block, T5LayerNorm -from ...configuration_utils import (ConfigMixin, ModuleUtilsMixin, - register_to_config) +from ...configuration_utils import ConfigMixin, ModuleUtilsMixin, register_to_config from ...models import ModelMixin class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): @register_to_config def __init__( - self, - max_length: int, - vocab_size: int, - d_model: int, - dropout_rate: float, - num_layers: int, - num_heads: int, - d_kv: int, - d_ff: int, - feed_forward_proj: str, - is_decoder: bool=False, ): + self, + max_length: int, + vocab_size: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + is_decoder: bool = False, + ): super().__init__() self.token_embedder = nn.Embedding(vocab_size, d_model) self.position_encoding = nn.Embedding(max_length, d_model) @@ -50,7 +50,8 @@ def __init__( dropout_rate=dropout_rate, feed_forward_proj=feed_forward_proj, is_decoder=is_decoder, - is_encoder_decoder=False, ) + is_encoder_decoder=False, + ) self.encoders = nn.LayerList() for lyr_num in range(num_layers): lyr = T5Block(t5config) @@ -67,8 +68,7 @@ def forward(self, encoder_input_tokens, encoder_inputs_mask): # inverted the attention mask input_shape = encoder_input_tokens.shape - extended_attention_mask = self.get_extended_attention_mask( - encoder_inputs_mask, input_shape) + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) for lyr in self.encoders: x = lyr(x, extended_attention_mask)[0] x = self.layer_norm(x) diff --git a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py index a7c2673f560f3..000fc9a868b02 100644 --- a/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -33,12 +33,13 @@ class SpectrogramDiffusionPipeline(DiffusionPipeline): _optional_components = ["melgan"] def __init__( - self, - notes_encoder: SpectrogramNotesEncoder, - continuous_encoder: SpectrogramContEncoder, - decoder: T5FilmDecoder, - scheduler: DDPMScheduler, - melgan: (Any), ) -> None: + self, + notes_encoder: SpectrogramNotesEncoder, + continuous_encoder: SpectrogramContEncoder, + decoder: T5FilmDecoder, + scheduler: DDPMScheduler, + melgan: (Any), + ) -> None: super().__init__() # From MELGAN @@ -50,25 +51,23 @@ def __init__( continuous_encoder=continuous_encoder, decoder=decoder, scheduler=scheduler, - melgan=melgan, ) + melgan=melgan, + ) def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): """Linearly scale features to network outputs range.""" min_out, max_out = output_range if clip: - features = paddle.clip( - x=features, min=self.min_value, 
max=self.max_value) + features = paddle.clip(x=features, min=self.min_value, max=self.max_value) # Scale to [0, 1]. - zero_one = (features - self.min_value) / ( - self.max_value - self.min_value) + zero_one = (features - self.min_value) / (self.max_value - self.min_value) # Scale to [min_out, max_out]. return zero_one * (max_out - min_out) + min_out def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): """Invert by linearly scaling network outputs to features range.""" min_out, max_out = input_range - outputs = paddle.clip( - x=outputs, min=min_out, max=max_out) if clip else outputs + outputs = paddle.clip(x=outputs, min=min_out, max=max_out) if clip else outputs # Scale to [0, 1]. zero_one = (outputs - min_out) / (max_out - min_out) # Scale to [self.min_value, self.max_value]. @@ -77,29 +76,27 @@ def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): def encode(self, input_tokens, continuous_inputs, continuous_mask): tokens_mask = input_tokens > 0 tokens_encoded, tokens_mask = self.notes_encoder( - encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask) + encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask + ) continuous_encoded, continuous_mask = self.continuous_encoder( - encoder_inputs=continuous_inputs.cast( - self.continuous_encoder.dtype), - encoder_inputs_mask=continuous_mask, ) - return [(tokens_encoded, tokens_mask), (continuous_encoded, - continuous_mask)] + encoder_inputs=continuous_inputs.cast(self.continuous_encoder.dtype), + encoder_inputs_mask=continuous_mask, + ) + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] def decode(self, encodings_and_masks, input_tokens, noise_time): timesteps = noise_time if not paddle.is_tensor(x=timesteps): - timesteps = paddle.to_tensor( - data=[timesteps], dtype="int64", place=input_tokens.place) + timesteps = paddle.to_tensor(data=[timesteps], dtype="int64", place=input_tokens.place) elif paddle.is_tensor(x=timesteps) and len(timesteps.shape) == 0: if isinstance(input_tokens.place, paddle.dtype): dtype = input_tokens.place - elif isinstance(input_tokens.place, - str) and input_tokens.place not in [ - "cpu", - "cuda", - "ipu", - "xpu", - ]: + elif isinstance(input_tokens.place, str) and input_tokens.place not in [ + "cpu", + "cuda", + "ipu", + "xpu", + ]: dtype = input_tokens.place elif isinstance(input_tokens.place, paddle.Tensor): dtype = input_tokens.place.dtype @@ -107,40 +104,41 @@ def decode(self, encodings_and_masks, input_tokens, noise_time): dtype = timesteps[None].dtype timesteps = timesteps[None].cast(dtype) # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps * paddle.ones( - shape=input_tokens.shape[0], dtype=timesteps.dtype) + timesteps = timesteps * paddle.ones(shape=input_tokens.shape[0], dtype=timesteps.dtype) logits = self.decoder( encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, - decoder_noise_time=timesteps, ) + decoder_noise_time=timesteps, + ) return logits @paddle.no_grad() def __call__( - self, - input_tokens: List[List[int]], - generator: Optional[paddle.Generator]=None, - num_inference_steps: int=100, - return_dict: bool=True, - output_type: str="numpy", - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, ) -> Union[AudioPipelineOutput, Tuple]: - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + self, + input_tokens: 
List[List[int]], + generator: Optional[paddle.Generator] = None, + num_inference_steps: int = 100, + return_dict: bool = True, + output_type: str = "numpy", + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + ) -> Union[AudioPipelineOutput, Tuple]: + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) - pred_mel = np.zeros( - [1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) + pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) ones = paddle.ones(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) for i, encoder_input_tokens in enumerate(input_tokens): if i == 0: - encoder_continuous_inputs = paddle.to_tensor( - data=pred_mel[:1].copy()).cast(self.decoder.dtype) + encoder_continuous_inputs = paddle.to_tensor(data=pred_mel[:1].copy()).cast(self.decoder.dtype) # The first chunk has no previous context. - encoder_continuous_mask = paddle.zeros( - shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) + encoder_continuous_mask = paddle.zeros(shape=(1, TARGET_FEATURE_LENGTH), dtype=bool) else: # The full song pipeline does not feed in a context feature, so the mask # will be all 0s after the feature converter. Because we know we're @@ -148,17 +146,19 @@ def __call__( # to all 1s. encoder_continuous_mask = ones encoder_continuous_inputs = self.scale_features( - encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True) + encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True + ) encodings_and_masks = self.encode( - input_tokens=paddle.to_tensor( - data=[encoder_input_tokens], dtype="int32"), + input_tokens=paddle.to_tensor(data=[encoder_input_tokens], dtype="int32"), continuous_inputs=encoder_continuous_inputs, - continuous_mask=encoder_continuous_mask, ) + continuous_mask=encoder_continuous_mask, + ) # Sample encoder_continuous_inputs shaped gaussian noise to begin loop x = randn_tensor( shape=encoder_continuous_inputs.shape, generator=generator, - dtype=self.decoder.dtype, ) + dtype=self.decoder.dtype, + ) # set step values self.scheduler.set_timesteps(num_inference_steps) # Denoising diffusion loop @@ -166,26 +166,24 @@ def __call__( output = self.decode( encodings_and_masks=encodings_and_masks, input_tokens=x, - noise_time=t / self.scheduler.config.num_train_timesteps, ) + noise_time=t / self.scheduler.config.num_train_timesteps, + ) # Compute previous output: x_t -> x_t-1 - x = self.scheduler.step( - output, t, x, generator=generator).prev_sample + x = self.scheduler.step(output, t, x, generator=generator).prev_sample mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) encoder_continuous_inputs = mel[:1] pred_mel = mel.cpu().astype(dtype="float32").numpy() - full_pred_mel = np.concatenate( - [full_pred_mel, pred_mel[:1]], axis=1) + full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) # call the callback, if provided if callback is not None and i % callback_steps == 0: callback(i, full_pred_mel) logger.info("Generated segment", i) if output_type == "numpy": - output = self.melgan( - input_features=full_pred_mel.astype(np.float32))[0] + output = self.melgan(input_features=full_pred_mel.astype(np.float32))[0] else: output = full_pred_mel if not return_dict: - return (output, ) + return (output,) return 
AudioPipelineOutput(audios=output) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py index 5bcf303c00772..fa4dcc515380f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/__init__.py @@ -19,10 +19,15 @@ import numpy as np import PIL.Image -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_fastdeploy_available, is_k_diffusion_available, - is_k_diffusion_version, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_fastdeploy_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_paddle_available, + is_paddlenlp_available, +) @dataclass @@ -51,44 +56,46 @@ class StableDiffusionPipelineOutput(BaseOutput): else: # new added from .hf_clip_model import ( - HFCLIPModel, HFCLIPTextModel, HFCLIPTextModelWithProjection, - HFCLIPVisionModel, HFCLIPVisionModelWithProjection) + HFCLIPModel, + HFCLIPTextModel, + HFCLIPTextModelWithProjection, + HFCLIPVisionModel, + HFCLIPVisionModelWithProjection, + ) from .pipeline_cycle_diffusion import CycleDiffusionPipeline from .pipeline_stable_diffusion import StableDiffusionPipeline - from .pipeline_stable_diffusion_adapter import \ - StableDiffusionAdapterPipeline - from .pipeline_stable_diffusion_all_in_one import \ - StableDiffusionPipelineAllinOne - from .pipeline_stable_diffusion_attend_and_excite import \ - StableDiffusionAttendAndExcitePipeline - from .pipeline_stable_diffusion_controlnet import \ - StableDiffusionControlNetPipeline - from .pipeline_stable_diffusion_depth2img import \ - StableDiffusionDepth2ImgPipeline - from .pipeline_stable_diffusion_image_variation import \ - StableDiffusionImageVariationPipeline - from .pipeline_stable_diffusion_img2img import \ - StableDiffusionImg2ImgPipeline - from .pipeline_stable_diffusion_inpaint import \ - StableDiffusionInpaintPipeline - from .pipeline_stable_diffusion_inpaint_legacy import \ - StableDiffusionInpaintPipelineLegacy - from .pipeline_stable_diffusion_instruct_pix2pix import \ - StableDiffusionInstructPix2PixPipeline - from .pipeline_stable_diffusion_k_diffusion import \ - StableDiffusionKDiffusionPipeline - from .pipeline_stable_diffusion_latent_upscale import \ - StableDiffusionLatentUpscalePipeline + from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline + from .pipeline_stable_diffusion_all_in_one import StableDiffusionPipelineAllinOne + from .pipeline_stable_diffusion_attend_and_excite import ( + StableDiffusionAttendAndExcitePipeline, + ) + from .pipeline_stable_diffusion_controlnet import StableDiffusionControlNetPipeline + from .pipeline_stable_diffusion_depth2img import StableDiffusionDepth2ImgPipeline + from .pipeline_stable_diffusion_image_variation import ( + StableDiffusionImageVariationPipeline, + ) + from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline + from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline + from .pipeline_stable_diffusion_inpaint_legacy import ( + StableDiffusionInpaintPipelineLegacy, + ) + from .pipeline_stable_diffusion_instruct_pix2pix import ( + StableDiffusionInstructPix2PixPipeline, + ) + from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline + from .pipeline_stable_diffusion_latent_upscale import ( + StableDiffusionLatentUpscalePipeline, + ) from .pipeline_stable_diffusion_mega 
import StableDiffusionMegaPipeline - from .pipeline_stable_diffusion_model_editing import \ - StableDiffusionModelEditingPipeline - from .pipeline_stable_diffusion_panorama import \ - StableDiffusionPanoramaPipeline - from .pipeline_stable_diffusion_pix2pix_zero import \ - StableDiffusionPix2PixZeroPipeline + from .pipeline_stable_diffusion_model_editing import ( + StableDiffusionModelEditingPipeline, + ) + from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline + from .pipeline_stable_diffusion_pix2pix_zero import ( + StableDiffusionPix2PixZeroPipeline, + ) from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline - from .pipeline_stable_diffusion_upscale import \ - StableDiffusionUpscalePipeline + from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline from .pipeline_stable_unclip import StableUnCLIPPipeline from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline from .safety_checker import StableDiffusionSafetyChecker @@ -100,21 +107,26 @@ class StableDiffusionPipelineOutput(BaseOutput): except OptionalDependencyNotAvailable: from ...utils.dummy_fastdeploy_objects import * # noqa F403 else: - from .pipeline_fastdeploy_cycle_diffusion import \ - FastDeployCycleDiffusionPipeline - from .pipeline_fastdeploy_stable_diffusion import \ - FastDeployStableDiffusionPipeline - from .pipeline_fastdeploy_stable_diffusion_controlnet import \ - FastDeployStableDiffusionControlNetPipeline - from .pipeline_fastdeploy_stable_diffusion_image_variation import \ - FastDeployStableDiffusionImageVariationPipeline - from .pipeline_fastdeploy_stable_diffusion_img2img import \ - FastDeployStableDiffusionImg2ImgPipeline - from .pipeline_fastdeploy_stable_diffusion_inpaint import \ - FastDeployStableDiffusionInpaintPipeline - from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import \ - FastDeployStableDiffusionInpaintPipelineLegacy - from .pipeline_fastdeploy_stable_diffusion_mega import \ - FastDeployStableDiffusionMegaPipeline - from .pipeline_fastdeploy_stable_diffusion_upscale import \ - FastDeployStableDiffusionUpscalePipeline + from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline + from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline + from .pipeline_fastdeploy_stable_diffusion_controlnet import ( + FastDeployStableDiffusionControlNetPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_image_variation import ( + FastDeployStableDiffusionImageVariationPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_img2img import ( + FastDeployStableDiffusionImg2ImgPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_inpaint import ( + FastDeployStableDiffusionInpaintPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import ( + FastDeployStableDiffusionInpaintPipelineLegacy, + ) + from .pipeline_fastdeploy_stable_diffusion_mega import ( + FastDeployStableDiffusionMegaPipeline, + ) + from .pipeline_fastdeploy_stable_diffusion_upscale import ( + FastDeployStableDiffusionUpscalePipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 1b9ac762bae8a..3f1cbee1f4454 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -22,21 +22,37 @@ import numpy as np import requests from paddlenlp.transformers import ( - 
BertTokenizer, CLIPFeatureExtractor, CLIPImageProcessor, CLIPTextModel, - CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionConfig, - CLIPVisionModelWithProjection) - -from ...models import (AutoencoderKL, ControlNetModel, PriorTransformer, - UNet2DConditionModel) + BertTokenizer, + CLIPFeatureExtractor, + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ...models import ( + AutoencoderKL, + ControlNetModel, + PriorTransformer, + UNet2DConditionModel, +) from ...schedulers import ( - DDIMScheduler, DDPMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, UnCLIPScheduler) + DDIMScheduler, + DDPMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UnCLIPScheduler, +) from ...utils import is_omegaconf_available, logging from ...utils.import_utils import BACKENDS_MAPPING from ...utils.load_utils import smart_load -from ..latent_diffusion.pipeline_latent_diffusion import (LDMBertConfig, - LDMBertModel) +from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from ..paint_by_example import PaintByExampleImageEncoder from ..pipeline_utils import DiffusionPipeline from .safety_checker import StableDiffusionSafetyChecker @@ -70,8 +86,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -87,8 +102,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -131,8 +145,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -140,21 +153,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." 
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -162,13 +174,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -179,8 +189,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -190,8 +199,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -212,9 +220,7 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, - image_size: int, - controlnet=False): +def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): """ Creates a config for the diffusers based on the config of the LDM model. 
""" @@ -225,34 +231,28 @@ def create_unet_diffusers_config(original_config, vae_params = original_config.model.params.first_stage_config.params.ddconfig - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2**(len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = (unet_params.use_linear_in_transformer - if "use_linear_in_transformer" in unet_params else - False) + use_linear_projection = ( + unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: @@ -267,9 +267,7 @@ def create_unet_diffusers_config(original_config, assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params.adm_in_channels else: - raise NotImplementedError( - f"Unknown conditional unet num_classes config: {unet_params.num_classes}" - ) + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") config = { "sample_size": image_size // vae_scale_factor, @@ -281,8 +279,7 @@ def create_unet_diffusers_config(original_config, "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, - "projection_class_embeddings_input_dim": - projection_class_embeddings_input_dim, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, } if not controlnet: @@ -328,7 +325,8 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular @@ -347,17 +345,19 @@ def create_ldm_bert_config(original_config): attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - pad_token_id=0, ) + pad_token_id=0, + ) return LDMBertConfig(**config) def convert_ldm_unet_checkpoint( - checkpoint, - config, - path=None, - extract_ema=False, - controlnet=False, - no_unet_key=False, ): + checkpoint, + config, + path=None, + extract_ema=False, + controlnet=False, + no_unet_key=False, +): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -384,8 +384,7 @@ def convert_ldm_unet_checkpoint( for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: if sum(k.startswith("model_ema") for k in keys) > 100: print( @@ -399,34 +398,23 @@ def convert_ldm_unet_checkpoint( new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] if config["class_embed_type"] is None: # No parameters to port ... - elif (config["class_embed_type"] == "timestep" or - config["class_embed_type"] == "projection"): - new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict[ - "label_emb.0.0.weight"] - new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict[ - "label_emb.0.0.bias"] - new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict[ - "label_emb.0.2.weight"] - new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict[ - "label_emb.0.2.bias"] + elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] else: - raise NotImplementedError( - f"Not implemented `class_embed_type`: {config['class_embed_type']}") + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] if not controlnet: @@ -436,35 +424,23 @@ def convert_ldm_unet_checkpoint( new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + 
layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -473,21 +449,17 @@ def convert_ldm_unet_checkpoint( layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -499,7 +471,8 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -512,19 +485,18 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -533,14 +505,13 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -551,12 +522,8 @@ def convert_ldm_unet_checkpoint( output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" 
in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -570,22 +537,19 @@ def convert_ldm_unet_checkpoint( new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - output_block_list = { - k: sorted(v) - for k, v in output_block_list.items() - } + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.bias", "conv.weight"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -595,27 +559,28 @@ def convert_ldm_unet_checkpoint( paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -624,48 +589,42 @@ def convert_ldm_unet_checkpoint( orig_index = 0 - new_checkpoint[ - "controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) orig_index += 2 diffusers_index = 0 while diffusers_index < 6: - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + 
new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) diffusers_index += 1 orig_index += 2 - new_checkpoint[ - "controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) # down blocks for i in range(num_input_blocks): - new_checkpoint[ - f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop( - f"zero_convs.{i}.0.weight") - new_checkpoint[ - f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop( - f"zero_convs.{i}.0.bias") + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") # mid block - new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop( - "middle_block_out.0.weight") - new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop( - "middle_block_out.0.bias") + new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") return new_checkpoint @@ -681,107 +640,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + 
new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, 
new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -789,58 +715,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -848,13 +766,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint): import paddle.nn as nn need_transpose = [] @@ -880,52 +798,56 @@ def convert_ldm_bert_checkpoint(checkpoint, config): bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[ - 
"transformer.token_emb.weight"] - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[ - "transformer.pos_emb.emb.weight"] + new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"] + new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"] for i in range(config.encoder_layers): double_i = 2 * i double_i_plus1 = 2 * i + 1 # convert norm new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight"] + f"transformer.attn_layers.layers.{double_i}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias"] - - new_checkpoint[ - f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"] + f"transformer.attn_layers.layers.{double_i}.0.bias" + ] + + new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_q.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_k.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_v.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" + ] new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight" + ].T new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight" + ].T new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].T + 
f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" + ].T - new_checkpoint["final_layer_norm.weight"] = bert_state_dict[ - "transformer.norm.weight"] - new_checkpoint["final_layer_norm.bias"] = bert_state_dict[ - "transformer.norm.bias"] + new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"] + new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"] ldmbert = LDMBertModel(config) ldmbert.eval() ldmbert.load_dict(new_checkpoint) @@ -942,12 +864,10 @@ def convert_ldm_clip_checkpoint(checkpoint): for key in keys: if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len( - "cond_stage_model.transformer."):]] = checkpoint[key] + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model @@ -955,14 +875,14 @@ def convert_ldm_clip_checkpoint(checkpoint): textenc_conversion_lst = [ ( "cond_stage_model.model.positional_embedding", - "text_model.embeddings.position_embedding.weight", ), + "text_model.embeddings.position_embedding.weight", + ), ( "cond_stage_model.model.token_embedding.weight", - "text_model.embeddings.token_embedding.weight", ), - ("cond_stage_model.model.ln_final.weight", - "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", - "text_model.final_layer_norm.bias"), + "text_model.embeddings.token_embedding.weight", + ), + ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), + ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), ] textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} @@ -977,10 +897,12 @@ def convert_ldm_clip_checkpoint(checkpoint): ("ln_final.", "transformer.text_model.final_layer_norm."), ( "token_embedding.weight", - "transformer.text_model.embeddings.token_embedding.weight", ), + "transformer.text_model.embeddings.token_embedding.weight", + ), ( "positional_embedding", - "transformer.text_model.embeddings.position_embedding.weight", ), + "transformer.text_model.embeddings.position_embedding.weight", + ), ] protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} textenc_pattern = re.compile("|".join(protected.keys())) @@ -997,12 +919,11 @@ def convert_paint_by_example_checkpoint(checkpoint): for key in keys: if key.startswith("cond_stage_model.transformer"): - model_dict[key[len("cond_stage_model.transformer."):]] = checkpoint[ - key] + model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] # load mapper keys_mapper = { - k[len("cond_stage_model.mapper.res"):]: v + k[len("cond_stage_model.mapper.res") :]: v for k, v in checkpoint.items() if k.startswith("cond_stage_model.mapper") } @@ -1017,7 +938,7 @@ def convert_paint_by_example_checkpoint(checkpoint): } for key, value in keys_mapper.items(): - prefix = key[:len("blocks.i")] + prefix = key[: len("blocks.i")] suffix = key.split(prefix)[-1].split(".")[-1] name = key.split(prefix)[-1].split(suffix)[0][1:-1] mapped_names = MAPPING[name] @@ -1026,13 +947,11 @@ def convert_paint_by_example_checkpoint(checkpoint): for i, mapped_name in enumerate(mapped_names): new_name = ".".join([prefix, mapped_name, suffix]) shape = value.shape[0] // num_splits - model_dict[new_name] = value[i * shape:(i + 1) * shape] + model_dict[new_name] = value[i * shape : (i + 1) * shape] 
# load final layer norm - model_dict["final_layer_norm.bias"] = checkpoint[ - "cond_stage_model.final_ln.bias"] - model_dict["final_layer_norm.weight"] = checkpoint[ - "cond_stage_model.final_ln.bias"] + model_dict["final_layer_norm.bias"] = checkpoint["cond_stage_model.final_ln.bias"] + model_dict["final_layer_norm.weight"] = checkpoint["cond_stage_model.final_ln.bias"] # load proj_out model_dict["proj_out.bias"] = checkpoint["proj_out.bias"] @@ -1042,64 +961,50 @@ def convert_paint_by_example_checkpoint(checkpoint): model_dict["uncond_vector"] = checkpoint["learnable_vector"] if len(model_dict) > 0: - model.load_dict( - PaintByExampleImageEncoder.smart_convert(model_dict, model)) + model.load_dict(PaintByExampleImageEncoder.smart_convert(model_dict, model)) return model def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="text_encoder") + text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") text_model.eval() keys = list(checkpoint.keys()) text_model_dict = {} if "cond_stage_model.model.text_projection" in checkpoint: - d_model = int(checkpoint["cond_stage_model.model.text_projection"] - .shape[0]) + d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) else: d_model = 1024 # text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") for key in keys: - if ("resblocks.23" in - key): # Diffusers drops the final layer and only uses the penultimate layer + if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer continue if key in textenc_conversion_map: text_model_dict[textenc_conversion_map[key]] = checkpoint[key] if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer."):] + new_key = key[len("cond_stage_model.model.transformer.") :] if new_key.endswith(".in_proj_weight"): - new_key = new_key[:-len(".in_proj_weight")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.weight"] = checkpoint[ - key][:d_model, :] - text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][ - d_model:d_model * 2, :] - text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][ - d_model * 2:, :] + new_key = new_key[: -len(".in_proj_weight")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] + text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] + text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] elif new_key.endswith(".in_proj_bias"): - new_key = new_key[:-len(".in_proj_bias")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.bias"] = checkpoint[ - key][:d_model] - text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][ - d_model:d_model * 2] - text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][ - d_model * 2:] + new_key = new_key[: -len(".in_proj_bias")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] + text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] + text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model 
* 2 :] else: - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) text_model_dict[new_key] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model @@ -1121,17 +1026,13 @@ def stable_unclip_image_encoder(original_config): if clip_model_name == "ViT-L/14": feature_extractor = CLIPImageProcessor() - image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "openai/clip-vit-large-patch14") + image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") else: - raise NotImplementedError( - f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}" - ) + raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}") elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder": feature_extractor = CLIPImageProcessor() - image_encoder = CLIPVisionModelWithProjection.from_pretrained( - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K") + image_encoder = CLIPVisionModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") else: raise NotImplementedError( f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}" @@ -1141,8 +1042,9 @@ def stable_unclip_image_encoder(original_config): def stable_unclip_image_noising_components( - original_config, - clip_stats_path: Optional[str]=None, ): + original_config, + clip_stats_path: Optional[str] = None, +): """ Returns the noising components for the img2img and txt2img unclip pipelines. 
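(Editor's note, not part of the diff: the two objects this helper returns are consumed later in the same file by the img2img and txt2img unclip pipelines, per the docstring above. A minimal usage sketch, mirroring the call that appears further down in download_from_original_stable_diffusion_ckpt:

    image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components(
        original_config,
        clip_stats_path=clip_stats_path,
    )
    # both components are then handed to the unclip pipeline constructors
)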
@@ -1162,15 +1064,12 @@ def stable_unclip_image_noising_components( max_noise_level = noise_aug_config.noise_schedule_config.timesteps beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule - image_normalizer = StableUnCLIPImageNormalizer( - embedding_dim=embedding_dim) - image_noising_scheduler = DDPMScheduler( - num_train_timesteps=max_noise_level, beta_schedule=beta_schedule) + image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim) + image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule) if "clip_stats_path" in noise_aug_config: if clip_stats_path is None: - raise ValueError( - "This stable unclip config requires a `clip_stats_path`") + raise ValueError("This stable unclip config requires a `clip_stats_path`") from ...utils import torch_load @@ -1189,22 +1088,21 @@ def stable_unclip_image_noising_components( image_normalizer.load_dict(clip_stats_state_dict) else: - raise NotImplementedError( - f"Unknown noise augmentor class: {noise_aug_class}") + raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}") image_normalizer.eval() return image_normalizer, image_noising_scheduler def convert_controlnet_checkpoint( - checkpoint, - original_config, - checkpoint_path, - image_size, - upcast_attention, - extract_ema, - no_unet_key=False, ): - ctrlnet_config = create_unet_diffusers_config( - original_config, image_size=image_size, controlnet=True) + checkpoint, + original_config, + checkpoint_path, + image_size, + upcast_attention, + extract_ema, + no_unet_key=False, +): + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention ctrlnet_config.pop("sample_size") @@ -1217,33 +1115,33 @@ def convert_controlnet_checkpoint( path=checkpoint_path, extract_ema=extract_ema, controlnet=True, - no_unet_key=no_unet_key, ) + no_unet_key=no_unet_key, + ) - controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, - converted_ctrl_checkpoint)) + controlnet_model.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint)) controlnet_model.eval() return controlnet_model def download_from_original_stable_diffusion_ckpt( - checkpoint_path: str, - original_config_file: str=None, - image_size: int=512, - prediction_type: str=None, - model_type: str=None, - extract_ema: bool=False, - scheduler_type: str="pndm", - num_in_channels: Optional[int]=None, - upcast_attention: Optional[bool]=None, - stable_unclip: Optional[str]=None, - stable_unclip_prior: Optional[str]=None, - clip_stats_path: Optional[str]=None, - controlnet: Optional[bool]=None, - load_safety_checker: bool=True, - pipeline_class: DiffusionPipeline=None, - paddle_dtype=None, - **kwargs, ) -> DiffusionPipeline: + checkpoint_path: str, + original_config_file: str = None, + image_size: int = 512, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "pndm", + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + stable_unclip: Optional[str] = None, + stable_unclip_prior: Optional[str] = None, + clip_stats_path: Optional[str] = None, + controlnet: Optional[bool] = None, + load_safety_checker: bool = True, + pipeline_class: DiffusionPipeline = None, + paddle_dtype=None, + **kwargs, +) -> DiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and 
(ideally) a `.yaml` config file. @@ -1288,10 +1186,14 @@ def download_from_original_stable_diffusion_ckpt( """ # import pipelines here to avoid circular import error when using from_ckpt method - from ppdiffusers import (LDMTextToImagePipeline, PaintByExamplePipeline, - StableDiffusionControlNetPipeline, - StableDiffusionPipeline, - StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline) + from ppdiffusers import ( + LDMTextToImagePipeline, + PaintByExamplePipeline, + StableDiffusionControlNetPipeline, + StableDiffusionPipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) if pipeline_class is None or pipeline_class.__name__ == "DiffusionPipeline": pipeline_class = StableDiffusionPipeline @@ -1304,8 +1206,7 @@ def download_from_original_stable_diffusion_ckpt( from omegaconf import OmegaConf - checkpoint = smart_load( - checkpoint_path, return_numpy=True, return_global_step=True) + checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True) # NOTE: this while loop isn't great but this controlnet checkpoint has one additional # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21 @@ -1347,11 +1248,12 @@ def download_from_original_stable_diffusion_ckpt( original_config = OmegaConf.load(original_config_file) if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - if ("parameterization" in original_config["model"]["params"] and - original_config["model"]["params"]["parameterization"] == "v"): + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): if prediction_type is None: # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` # as it relies on a brittle global step parameter here @@ -1376,7 +1278,8 @@ def download_from_original_stable_diffusion_ckpt( checkpoint_path, image_size, upcast_attention, - extract_ema, ) + extract_ema, + ) num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start beta_end = original_config.model.params.linear_end @@ -1389,7 +1292,8 @@ def download_from_original_stable_diffusion_ckpt( steps_offset=1, clip_sample=False, set_alpha_to_one=False, - prediction_type=prediction_type, ) + prediction_type=prediction_type, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -1404,8 +1308,7 @@ def download_from_original_stable_diffusion_ckpt( elif scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif scheduler_type == "ddim": @@ -1414,40 +1317,31 @@ def download_from_original_stable_diffusion_ckpt( raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") # Convert the UNet2DConditionModel model. 
- unet_config = create_unet_diffusers_config( - original_config, image_size=image_size) + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) unet_config["upcast_attention"] = upcast_attention unet = UNet2DConditionModel(**unet_config) unet.eval() converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema) - unet.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(unet, - converted_unet_checkpoint)) + checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema + ) + unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint)) # Convert the VAE model. - vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) vae.eval() - vae.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(vae, - converted_vae_checkpoint)) + vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint)) # Convert the text model. if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] - logger.debug( - f"no `model_type` given, `model_type` inferred as: {model_type}") + model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") if model_type == "FrozenOpenCLIPEmbedder": text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2/tokenizer") + tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer") if stable_unclip is None: if controlnet: @@ -1460,7 +1354,8 @@ def download_from_original_stable_diffusion_ckpt( controlnet=controlnet_model, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) else: pipe = pipeline_class( vae=vae, @@ -1470,18 +1365,16 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) else: - ( - image_normalizer, - image_noising_scheduler, - ) = stable_unclip_image_noising_components( + (image_normalizer, image_noising_scheduler,) = stable_unclip_image_noising_components( original_config, - clip_stats_path=clip_stats_path, ) + clip_stats_path=clip_stats_path, + ) if stable_unclip == "img2img": - feature_extractor, image_encoder = stable_unclip_image_encoder( - original_config) + feature_extractor, image_encoder = stable_unclip_image_encoder(original_config) pipe = StableUnCLIPImg2ImgPipeline( # image encoding components @@ -1496,26 +1389,20 @@ def download_from_original_stable_diffusion_ckpt( unet=unet, scheduler=scheduler, # vae - vae=vae, ) + vae=vae, + ) elif stable_unclip == "txt2img": if stable_unclip_prior is None or stable_unclip_prior == "karlo": karlo_model = "kakaobrain/karlo-v1-alpha" - prior = PriorTransformer.from_pretrained( - karlo_model, subfolder="prior") - - prior_tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") - prior_text_model = CLIPTextModelWithProjection.from_pretrained( - "openai/clip-vit-large-patch14") - - prior_scheduler = 
UnCLIPScheduler.from_pretrained( - karlo_model, subfolder="prior_scheduler") - prior_scheduler = DDPMScheduler.from_config( - prior_scheduler.config) + prior = PriorTransformer.from_pretrained(karlo_model, subfolder="prior") + + prior_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + prior_text_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") + + prior_scheduler = UnCLIPScheduler.from_pretrained(karlo_model, subfolder="prior_scheduler") + prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config) else: - raise NotImplementedError( - f"unknown prior for stable unclip model: {stable_unclip_prior}" - ) + raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}") pipe = StableUnCLIPPipeline( # prior components @@ -1532,33 +1419,29 @@ def download_from_original_stable_diffusion_ckpt( unet=unet, scheduler=scheduler, # vae - vae=vae, ) + vae=vae, + ) else: - raise NotImplementedError( - f"unknown `stable_unclip` type: {stable_unclip}") + raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}") elif model_type == "PaintByExample": vision_model = convert_paint_by_example_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") - feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-safety-checker") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") pipe = PaintByExamplePipeline( vae=vae, image_encoder=vision_model, unet=unet, scheduler=scheduler, safety_checker=None, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) elif model_type == "FrozenCLIPEmbedder": text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "openai/clip-vit-large-patch14") + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") if load_safety_checker: - safety_checker = StableDiffusionSafetyChecker.from_pretrained( - "CompVis/stable-diffusion-safety-checker") - feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-safety-checker") + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-safety-checker") else: safety_checker = None feature_extractor = None @@ -1573,7 +1456,8 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=load_safety_checker, ) + requires_safety_checker=load_safety_checker, + ) else: pipe = pipeline_class( vae=vae, @@ -1583,19 +1467,20 @@ def download_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=load_safety_checker, ) + requires_safety_checker=load_safety_checker, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizer.from_pretrained( - "bert-base-uncased", model_max_length=77) + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77) pipe = LDMTextToImagePipeline( vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) if 
paddle_dtype is not None: pipe.to(paddle_dtype=paddle_dtype) @@ -1603,13 +1488,14 @@ def download_from_original_stable_diffusion_ckpt( def download_controlnet_from_original_ckpt( - checkpoint_path: str, - original_config_file: str, - image_size: int=512, - extract_ema: bool=False, - num_in_channels: Optional[int]=None, - upcast_attention: Optional[bool]=None, - no_unet_key: Optional[bool]=False, ) -> DiffusionPipeline: + checkpoint_path: str, + original_config_file: str, + image_size: int = 512, + extract_ema: bool = False, + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + no_unet_key: Optional[bool] = False, +) -> DiffusionPipeline: if not is_omegaconf_available(): raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) @@ -1636,12 +1522,10 @@ def download_controlnet_from_original_ckpt( original_config = OmegaConf.load(original_config_file) if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels if "control_stage_config" not in original_config.model.params: - raise ValueError( - "`control_stage_config` not present in original config") + raise ValueError("`control_stage_config` not present in original config") controlnet_model = convert_controlnet_checkpoint( checkpoint, @@ -1650,6 +1534,7 @@ def download_controlnet_from_original_ckpt( image_size, upcast_attention, extract_ema, - no_unet_key, ) + no_unet_key, + ) return controlnet_model diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py index fc8dfda8a0781..4a8c6336fd55d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/convert_from_ckpt_deprecated.py @@ -20,17 +20,32 @@ import numpy as np import requests -from paddlenlp.transformers import (BertTokenizer, CLIPFeatureExtractor, - CLIPTextModel, CLIPTokenizer) +from paddlenlp.transformers import ( + BertTokenizer, + CLIPFeatureExtractor, + CLIPTextModel, + CLIPTokenizer, +) from ppdiffusers import ( - AutoencoderKL, ControlNetModel, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LDMTextToImagePipeline, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionControlNetPipeline, StableDiffusionPipeline, - UNet2DConditionModel) + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LDMTextToImagePipeline, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionControlNetPipeline, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.pipelines.latent_diffusion.pipeline_latent_diffusion import ( - LDMBertConfig, LDMBertModel) + LDMBertConfig, + LDMBertModel, +) from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from ...utils import is_omegaconf_available, logging @@ -65,8 +80,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, 
n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -82,8 +96,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -126,8 +139,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -135,21 +147,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -157,13 +168,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -174,8 +183,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -185,8 +193,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -207,9 +214,7 @@ def conv_attn_to_linear(checkpoint): checkpoint[key] = checkpoint[key][:, :, 0] -def create_unet_diffusers_config(original_config, - image_size: int, - controlnet=False): +def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): """ Creates a config for the diffusers based on the config of the LDM model. 
""" @@ -220,34 +225,28 @@ def create_unet_diffusers_config(original_config, vae_params = original_config.model.params.first_stage_config.params.ddconfig - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2**(len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = (unet_params.use_linear_in_transformer - if "use_linear_in_transformer" in unet_params else - False) + use_linear_projection = ( + unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: @@ -262,9 +261,7 @@ def create_unet_diffusers_config(original_config, assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params.adm_in_channels else: - raise NotImplementedError( - f"Unknown conditional unet num_classes config: {unet_params.num_classes}" - ) + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") config = dict( sample_size=image_size // vae_scale_factor, @@ -304,7 +301,8 @@ def create_vae_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config @@ -330,15 +328,12 @@ def create_ldm_bert_config(original_config): attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, - pad_token_id=0, ) + pad_token_id=0, + ) return LDMBertConfig(**config) -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False, - controlnet=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -362,8 +357,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith(unet_key[:-1]): flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: if sum(k.startswith("model_ema") for k in keys) > 100: print( @@ -377,34 +371,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] if config["class_embed_type"] is None: # No parameters to port ... - elif (config["class_embed_type"] == "timestep" or - config["class_embed_type"] == "projection"): - new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict[ - "label_emb.0.0.weight"] - new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict[ - "label_emb.0.0.bias"] - new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict[ - "label_emb.0.2.weight"] - new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict[ - "label_emb.0.2.bias"] + elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] else: - raise NotImplementedError( - f"Not implemented `class_embed_type`: {config['class_embed_type']}") + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] if not controlnet: @@ -414,35 +397,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if 
f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -451,21 +422,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -477,7 +444,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -490,19 +458,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -511,14 +478,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -529,12 +495,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if 
len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -548,22 +510,19 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - output_block_list = { - k: sorted(v) - for k, v in output_block_list.items() - } + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.bias", "conv.weight"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -573,27 +532,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -602,48 +562,42 @@ def convert_ldm_unet_checkpoint(checkpoint, orig_index = 0 - new_checkpoint[ - "controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) orig_index += 2 diffusers_index = 0 while diffusers_index < 6: - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + 
new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) diffusers_index += 1 orig_index += 2 - new_checkpoint[ - "controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.weight") - new_checkpoint[ - "controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( - f"input_hint_block.{orig_index}.bias") + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) # down blocks for i in range(num_input_blocks): - new_checkpoint[ - f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop( - f"zero_convs.{i}.0.weight") - new_checkpoint[ - f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop( - f"zero_convs.{i}.0.bias") + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") # mid block - new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop( - "middle_block_out.0.weight") - new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop( - "middle_block_out.0.bias") + new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") return new_checkpoint @@ -659,107 +613,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + 
new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": 
f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -767,58 +688,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -826,13 +739,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint): import paddle.nn as nn need_transpose = [] @@ -858,52 +771,56 @@ def convert_ldm_bert_checkpoint(checkpoint, config): 
bert_state_dict[key.replace(bert_key, "")] = checkpoint.get(key) new_checkpoint = {} - new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict[ - "transformer.token_emb.weight"] - new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict[ - "transformer.pos_emb.emb.weight"] + new_checkpoint["embeddings.word_embeddings.weight"] = bert_state_dict["transformer.token_emb.weight"] + new_checkpoint["embeddings.position_embeddings.weight"] = bert_state_dict["transformer.pos_emb.emb.weight"] for i in range(config.encoder_layers): double_i = 2 * i double_i_plus1 = 2 * i + 1 # convert norm new_checkpoint[f"encoder.layers.{i}.norm1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.weight"] + f"transformer.attn_layers.layers.{double_i}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.0.bias"] - - new_checkpoint[ - f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_q.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_k.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_v.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.weight"].T - new_checkpoint[ - f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i}.1.to_out.bias"] + f"transformer.attn_layers.layers.{double_i}.0.bias" + ] + + new_checkpoint[f"encoder.layers.{i}.self_attn.q_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_q.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.k_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_k.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.v_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_v.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.weight"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.weight" + ].T + new_checkpoint[f"encoder.layers.{i}.self_attn.out_proj.bias"] = bert_state_dict[ + f"transformer.attn_layers.layers.{double_i}.1.to_out.bias" + ] new_checkpoint[f"encoder.layers.{i}.norm2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.weight"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.weight" + ] new_checkpoint[f"encoder.layers.{i}.norm2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear1.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.weight" + ].T new_checkpoint[f"encoder.layers.{i}.linear1.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias"] + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.0.0.bias" + ] new_checkpoint[f"encoder.layers.{i}.linear2.weight"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.weight" + ].T 
new_checkpoint[f"encoder.layers.{i}.linear2.bias"] = bert_state_dict[ - f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias"].T + f"transformer.attn_layers.layers.{double_i_plus1}.1.net.2.bias" + ].T - new_checkpoint["final_layer_norm.weight"] = bert_state_dict[ - "transformer.norm.weight"] - new_checkpoint["final_layer_norm.bias"] = bert_state_dict[ - "transformer.norm.bias"] + new_checkpoint["final_layer_norm.weight"] = bert_state_dict["transformer.norm.weight"] + new_checkpoint["final_layer_norm.bias"] = bert_state_dict["transformer.norm.bias"] ldmbert = LDMBertModel(config) ldmbert.eval() ldmbert.load_dict(new_checkpoint) @@ -911,8 +828,7 @@ def convert_ldm_bert_checkpoint(checkpoint, config): def convert_ldm_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="text_encoder") + text_model = CLIPTextModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="text_encoder") text_model.eval() keys = list(checkpoint.keys()) @@ -921,12 +837,10 @@ def convert_ldm_clip_checkpoint(checkpoint): for key in keys: if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len( - "cond_stage_model.transformer."):]] = checkpoint[key] + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model @@ -934,14 +848,14 @@ def convert_ldm_clip_checkpoint(checkpoint): textenc_conversion_lst = [ ( "cond_stage_model.model.positional_embedding", - "text_model.embeddings.position_embedding.weight", ), + "text_model.embeddings.position_embedding.weight", + ), ( "cond_stage_model.model.token_embedding.weight", - "text_model.embeddings.token_embedding.weight", ), - ("cond_stage_model.model.ln_final.weight", - "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", - "text_model.final_layer_norm.bias"), + "text_model.embeddings.token_embedding.weight", + ), + ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), + ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), ] textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} @@ -956,83 +870,73 @@ def convert_ldm_clip_checkpoint(checkpoint): ("ln_final.", "transformer.text_model.final_layer_norm."), ( "token_embedding.weight", - "transformer.text_model.embeddings.token_embedding.weight", ), + "transformer.text_model.embeddings.token_embedding.weight", + ), ( "positional_embedding", - "transformer.text_model.embeddings.position_embedding.weight", ), + "transformer.text_model.embeddings.position_embedding.weight", + ), ] protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} textenc_pattern = re.compile("|".join(protected.keys())) def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="text_encoder") + text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") text_model.eval() keys = list(checkpoint.keys()) text_model_dict = {} if "cond_stage_model.model.text_projection" in checkpoint: - d_model = int(checkpoint["cond_stage_model.model.text_projection"] - .shape[0]) + d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) else: d_model = 1024 for key in keys: - if ("resblocks.23" in 
- key): # Diffusers drops the final layer and only uses the penultimate layer + if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer continue if key in textenc_conversion_map: text_model_dict[textenc_conversion_map[key]] = checkpoint[key] if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer."):] + new_key = key[len("cond_stage_model.model.transformer.") :] if new_key.endswith(".in_proj_weight"): - new_key = new_key[:-len(".in_proj_weight")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.weight"] = checkpoint[ - key][:d_model, :] - text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][ - d_model:d_model * 2, :] - text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][ - d_model * 2:, :] + new_key = new_key[: -len(".in_proj_weight")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] + text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] + text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] elif new_key.endswith(".in_proj_bias"): - new_key = new_key[:-len(".in_proj_bias")] - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) - text_model_dict[new_key + ".q_proj.bias"] = checkpoint[ - key][:d_model] - text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][ - d_model:d_model * 2] - text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][ - d_model * 2:] + new_key = new_key[: -len(".in_proj_bias")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] + text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] + text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :] else: - new_key = textenc_pattern.sub( - lambda m: protected[re.escape(m.group(0))], new_key) + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) text_model_dict[new_key] = checkpoint[key] if len(text_model_dict) > 0: - text_model.load_dict( - CLIPTextModel.smart_convert(text_model_dict, text_model)) + text_model.load_dict(CLIPTextModel.smart_convert(text_model_dict, text_model)) return text_model def load_pipeline_from_original_stable_diffusion_ckpt( - checkpoint_path: str, - original_config_file: str=None, - image_size: int=512, - prediction_type: str=None, - model_type: str=None, - extract_ema: bool=False, - scheduler_type: str="pndm", - num_in_channels: Optional[int]=None, - upcast_attention: Optional[bool]=None, - paddle_dtype: Optional[bool]=None, - requires_safety_checker: bool=False, - controlnet: Optional[bool]=None, - cls=None, - **kwargs, ) -> StableDiffusionPipeline: + checkpoint_path: str, + original_config_file: str = None, + image_size: int = 512, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "pndm", + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + paddle_dtype: Optional[bool] = None, + requires_safety_checker: bool = False, + controlnet: Optional[bool] = None, + cls=None, + **kwargs, +) -> StableDiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` 
config file. @@ -1079,8 +983,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( from omegaconf import OmegaConf - checkpoint = smart_load( - checkpoint_path, return_numpy=True, return_global_step=True) + checkpoint = smart_load(checkpoint_path, return_numpy=True, return_global_step=True) global_step = int(checkpoint.pop("global_step", -1)) @@ -1106,8 +1009,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" original_config_file = os.path.join(tmpdir, "inference.yaml") - if key_name in checkpoint and checkpoint[key_name].shape[ - -1] == 1024: + if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024: if not os.path.isfile("v2-inference-v.yaml"): # model_type = "v2" r = requests.get( @@ -1129,11 +1031,12 @@ def load_pipeline_from_original_stable_diffusion_ckpt( original_config = OmegaConf.load(original_config_file) if num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - if ("parameterization" in original_config["model"]["params"] and - original_config["model"]["params"]["parameterization"] == "v"): + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): if prediction_type is None: # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` # as it relies on a brittle global step parameter here @@ -1160,7 +1063,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt( steps_offset=1, clip_sample=False, set_alpha_to_one=False, - prediction_type=prediction_type, ) + prediction_type=prediction_type, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -1175,8 +1079,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( elif scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif scheduler_type == "ddim": @@ -1185,44 +1088,35 @@ def load_pipeline_from_original_stable_diffusion_ckpt( raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config( - original_config, image_size=image_size) + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) unet_config["upcast_attention"] = upcast_attention unet = UNet2DConditionModel(**unet_config) unet.eval() converted_unet_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema) - unet.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(unet, - converted_unet_checkpoint)) + checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema + ) + unet.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(unet, converted_unet_checkpoint)) # Convert the VAE model. 
- vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) vae.eval() - vae.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers(vae, - converted_vae_checkpoint)) + vae.load_dict(convert_diffusers_vae_unet_to_ppdiffusers(vae, converted_vae_checkpoint)) # Convert the text model. if model_type is None: - model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] - logger.debug( - f"no `model_type` given, `model_type` inferred as: {model_type}") + model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") if controlnet is None: controlnet = "control_stage_config" in original_config.model.params if model_type == "FrozenOpenCLIPEmbedder": text_model = convert_open_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "stabilityai/stable-diffusion-2/tokenizer") + tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2/tokenizer") if paddle_dtype is not None: vae.to(dtype=paddle_dtype) @@ -1231,8 +1125,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( if controlnet: # Convert the ControlNetModel model. - ctrlnet_config = create_unet_diffusers_config( - original_config, image_size=image_size, controlnet=True) + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention ctrlnet_config.pop("sample_size") @@ -1245,10 +1138,11 @@ def load_pipeline_from_original_stable_diffusion_ckpt( ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, - controlnet=True, ) + controlnet=True, + ) controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers( - controlnet_model, converted_ctrl_checkpoint)) + convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint) + ) if paddle_dtype is not None: controlnet_model.to(dtype=paddle_dtype) @@ -1262,7 +1156,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) else: pipe = cls( vae=vae, @@ -1272,17 +1167,19 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) elif model_type == "FrozenCLIPEmbedder": text_model = convert_ldm_clip_checkpoint(checkpoint) - tokenizer = CLIPTokenizer.from_pretrained( - "CompVis/stable-diffusion-v1-4/tokenizer") + tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4/tokenizer") if requires_safety_checker: safety_checker = StableDiffusionSafetyChecker.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="safety_checker") + "CompVis/stable-diffusion-v1-4", subfolder="safety_checker" + ) feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4", subfolder="feature_extractor") + "CompVis/stable-diffusion-v1-4", subfolder="feature_extractor" + ) else: safety_checker = feature_extractor = None @@ -1295,8 +1192,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( if controlnet: # 
Convert the ControlNetModel model. - ctrlnet_config = create_unet_diffusers_config( - original_config, image_size=image_size, controlnet=True) + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention ctrlnet_config.pop("sample_size") @@ -1309,10 +1205,11 @@ def load_pipeline_from_original_stable_diffusion_ckpt( ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, - controlnet=True, ) + controlnet=True, + ) controlnet_model.load_dict( - convert_diffusers_vae_unet_to_ppdiffusers( - controlnet_model, converted_ctrl_checkpoint)) + convert_diffusers_vae_unet_to_ppdiffusers(controlnet_model, converted_ctrl_checkpoint) + ) if paddle_dtype is not None: controlnet_model.to(dtype=paddle_dtype) @@ -1326,7 +1223,8 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) else: pipe = cls( vae=vae, @@ -1336,12 +1234,12 @@ def load_pipeline_from_original_stable_diffusion_ckpt( scheduler=scheduler, safety_checker=safety_checker, feature_extractor=feature_extractor, - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) - tokenizer = BertTokenizer.from_pretrained( - "bert-base-uncased", model_max_length=77) + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", model_max_length=77) if paddle_dtype is not None: vae.to(dtype=paddle_dtype) text_model.to(dtype=paddle_dtype) @@ -1351,6 +1249,7 @@ def load_pipeline_from_original_stable_diffusion_ckpt( bert=text_model, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) return pipe diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py index 29d9afb9eef79..5b406410e76aa 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/hf_clip_model.py @@ -26,9 +26,15 @@ from paddle.distributed.fleet.utils import recompute from paddlenlp.transformers.activations import ACT2FN from paddlenlp.transformers.clip.configuration import ( - CLIPConfig, CLIPTextConfig, CLIPVisionConfig) + CLIPConfig, + CLIPTextConfig, + CLIPVisionConfig, +) from paddlenlp.transformers.model_outputs import ( - BaseModelOutput, BaseModelOutputWithPooling, ModelOutput) + BaseModelOutput, + BaseModelOutputWithPooling, + ModelOutput, +) from paddlenlp.transformers.model_utils import PretrainedModel from ppdiffusers.initializer import normal_, ones_ @@ -39,7 +45,7 @@ ] -def finfo(dtype: paddle.dtype=None): +def finfo(dtype: paddle.dtype = None): if dtype is None: dtype = paddle.get_default_dtype() @@ -58,10 +64,7 @@ class BFloatFInfo: def Parameter(data: paddle.Tensor, requires_grad=True): - tensor = paddle.create_parameter( - data.shape, - dtype=data.dtype, - default_initializer=nn.initializer.Assign(data)) + tensor = paddle.create_parameter(data.shape, dtype=data.dtype, default_initializer=nn.initializer.Assign(data)) if not requires_grad: tensor.stop_gradient = True return tensor @@ -74,13 +77,14 @@ class TorchLinear(nn.Layer): """ def __init__( - self, - in_features, - out_features, - weight_attr=None, - 
bias_attr=None, - name=None, - bias=None, ): + self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None, + bias=None, + ): super().__init__() self._dtype = self._helper.get_default_dtype() self._weight_attr = weight_attr @@ -96,23 +100,25 @@ def __init__( ], # regular linear has shape [in_features, out_features] attr=self._weight_attr, dtype=self._dtype, - is_bias=False, ) + is_bias=False, + ) self.bias = self.create_parameter( shape=[out_features], attr=self._bias_attr, dtype=self._dtype, - is_bias=True, ) + is_bias=True, + ) self.name = name def forward(self, input): - out = F.linear( - x=input, weight=self.weight.T, bias=self.bias, name=self.name) + out = F.linear(x=input, weight=self.weight.T, bias=self.bias, name=self.name) return out def extra_repr(self): name_str = ", name={}".format(self.name) if self.name else "" return "in_features={}, out_features={}, dtype={}{}".format( - self.weight.shape[1], self.weight.shape[0], self._dtype, name_str) + self.weight.shape[1], self.weight.shape[0], self._dtype, name_str + ) def str2bool(v): @@ -139,20 +145,18 @@ def masked_fill(x, mask, value): return paddle.where(mask, y, x) -def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int]=None): +def _expand_mask(mask: paddle.Tensor, dtype, tgt_len: Optional[int] = None): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.shape tgt_len = tgt_len if tgt_len is not None else src_len - expanded_mask = ( - mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype)) + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype) inverted_mask = 1.0 - expanded_mask - return masked_fill(inverted_mask, - inverted_mask.cast(paddle.bool), finfo(dtype).min) + return masked_fill(inverted_mask, inverted_mask.cast(paddle.bool), finfo(dtype).min) # contrastive loss function, adapted from @@ -256,9 +260,10 @@ class HFCLIPOutput(ModelOutput): vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> Tuple[Any]: - return tuple(self[k] - if k not in ["text_model_output", "vision_model_output"] - else getattr(self, k).to_tuple() for k in self.keys()) + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) class HFCLIPVisionEmbeddings(nn.Layer): @@ -269,30 +274,29 @@ def __init__(self, config: CLIPVisionConfig): self.image_size = config.image_size self.patch_size = config.patch_size - self.class_embedding = Parameter(paddle.randn((self.embed_dim, ))) + self.class_embedding = Parameter(paddle.randn((self.embed_dim,))) self.patch_embedding = nn.Conv2D( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, - bias_attr=False, ) + bias_attr=False, + ) - self.num_patches = (self.image_size // self.patch_size)**2 + self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 - self.position_embedding = nn.Embedding(self.num_positions, - self.embed_dim) + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer( "position_ids", - paddle.arange(self.num_positions).expand( - (1, -1), dtype="int64"), - persistable=False, ) + paddle.arange(self.num_positions).expand((1, -1), dtype="int64"), + persistable=False, + ) def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: batch_size = pixel_values.shape[0] target_dtype = 
self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding( - pixel_values.cast(target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = self.patch_embedding(pixel_values.cast(target_dtype)) # shape = [*, width, grid, grid] patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) class_embeds = self.class_embedding.expand([batch_size, 1, -1]) @@ -307,23 +311,22 @@ def __init__(self, config: CLIPTextConfig): embed_dim = config.hidden_size self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) - self.position_embedding = nn.Embedding(config.max_position_embeddings, - embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer( "position_ids", - paddle.arange( - config.max_position_embeddings, dtype="int64").expand((1, -1)), - persistable=False, ) + paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)), + persistable=False, + ) def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - inputs_embeds: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: - seq_length = (input_ids.shape[-1] - if input_ids is not None else inputs_embeds.shape[-2]) + self, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] if position_ids is None: position_ids = self.position_ids[:, :seq_length] @@ -349,7 +352,8 @@ def __init__(self, config): if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads}).") + f" {self.num_heads})." 
+ ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout @@ -359,18 +363,15 @@ def __init__(self, config): self.out_proj = LinearClass(self.embed_dim, self.embed_dim) def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): - return tensor.reshape( - [bsz, seq_len, self.num_heads, self.head_dim]).transpose( - [0, 2, 1, 3]) + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - causal_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=False, ) -> Tuple[ - paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[ - paddle.Tensor]]]: + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + causal_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.shape @@ -381,8 +382,7 @@ def forward( value_states = self._shape(self.v_proj(hidden_states), -1, bsz) proj_shape = (bsz * self.num_heads, -1, self.head_dim) - query_states = self._shape(query_states, tgt_len, - bsz).reshape(proj_shape) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) key_states = key_states.reshape(proj_shape) value_states = value_states.reshape(proj_shape) @@ -392,29 +392,26 @@ def forward( if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: raise ValueError( f"Attention weights should be of size {[bsz * self.num_heads, tgt_len, src_len]}, but is" - f" {attn_weights.shape}") + f" {attn_weights.shape}" + ) # apply the causal_attention_mask first if causal_attention_mask is not None: if causal_attention_mask.shape != [bsz, 1, tgt_len, src_len]: raise ValueError( f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is" - f" {causal_attention_mask.shape}") - attn_weights = ( - attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + - causal_attention_mask) - attn_weights = attn_weights.reshape( - [bsz * self.num_heads, tgt_len, src_len]) + f" {causal_attention_mask.shape}" + ) + attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + causal_attention_mask + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) if attention_mask is not None: if attention_mask.shape != [bsz, 1, tgt_len, src_len]: raise ValueError( f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}" ) - attn_weights = (attn_weights.reshape( - [bsz, self.num_heads, tgt_len, src_len]) + attention_mask) - attn_weights = attn_weights.reshape( - [bsz * self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attention_mask + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) attn_weights = F.softmax(attn_weights, axis=-1) @@ -423,25 +420,22 @@ def forward( # make sure that attn_weights keeps its gradient. 
# In order to do so, attn_weights have to reshaped # twice and have to be reused in the following - attn_weights_reshaped = attn_weights.reshape( - [bsz, self.num_heads, tgt_len, src_len]) - attn_weights = attn_weights_reshaped.reshape( - [bsz * self.num_heads, tgt_len, src_len]) + attn_weights_reshaped = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights_reshaped.reshape([bsz * self.num_heads, tgt_len, src_len]) else: attn_weights_reshaped = None - attn_probs = F.dropout( - attn_weights, p=self.dropout, training=self.training) + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = paddle.matmul(attn_probs, value_states) if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: raise ValueError( f"`attn_output` should be of size {[bsz, self.num_heads, tgt_len, self.head_dim]}, but is" - f" {attn_output.shape}") + f" {attn_output.shape}" + ) - attn_output = attn_output.reshape( - [bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) attn_output = attn_output.transpose([0, 2, 1, 3]) attn_output = attn_output.reshape([bsz, tgt_len, embed_dim]) @@ -470,18 +464,17 @@ def __init__(self, config: CLIPTextConfig): super().__init__() self.embed_dim = config.hidden_size self.self_attn = HFCLIPAttention(config) - self.layer_norm1 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) self.mlp = HFCLIPMLP(config) - self.layer_norm2 = nn.LayerNorm( - self.embed_dim, epsilon=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - hidden_states: paddle.Tensor, - attention_mask: paddle.Tensor, - causal_attention_mask: paddle.Tensor, - output_attentions: Optional[bool]=False, ) -> Tuple[paddle.Tensor]: + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + causal_attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: """ Args: hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` @@ -499,7 +492,8 @@ def forward( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = residual + hidden_states residual = hidden_states @@ -507,10 +501,10 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states - outputs = (hidden_states, ) + outputs = (hidden_states,) if output_attentions: - outputs += (attn_weights, ) + outputs += (attn_weights,) return outputs @@ -531,24 +525,21 @@ def _init_weights(self, module): factor = self.config.initializer_factor if isinstance(module, HFCLIPTextEmbeddings): normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02) - normal_( - module.position_embedding.weight, mean=0.0, std=factor * 0.02) + normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02) elif isinstance(module, HFCLIPVisionEmbeddings): factor = self.config.initializer_factor - normal_( - module.class_embedding, - mean=0.0, - std=module.embed_dim**-0.5 * factor) + normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) normal_( module.patch_embedding.weight, - std=module.config.initializer_range * factor, ) + std=module.config.initializer_range * factor, + ) 
normal_( module.position_embedding.weight, - std=module.config.initializer_range * factor, ) + std=module.config.initializer_range * factor, + ) elif isinstance(module, HFCLIPAttention): factor = self.config.initializer_factor - in_proj_std = ((module.embed_dim**-0.5) * ( - (2 * module.config.num_hidden_layers)**-0.5) * factor) + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor out_proj_std = (module.embed_dim**-0.5) * factor normal_(module.q_proj.weight, std=in_proj_std) normal_(module.k_proj.weight, std=in_proj_std) @@ -556,30 +547,31 @@ def _init_weights(self, module): normal_(module.out_proj.weight, std=out_proj_std) elif isinstance(module, HFCLIPMLP): factor = self.config.initializer_factor - in_proj_std = ((module.config.hidden_size**-0.5) * ( - (2 * module.config.num_hidden_layers)**-0.5) * factor) - fc_std = (2 * module.config.hidden_size)**-0.5 * factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor normal_(module.fc1.weight, std=fc_std) normal_(module.fc2.weight, std=in_proj_std) elif isinstance(module, HFCLIPModel): normal_( module.text_projection.weight, - std=module.text_embed_dim - **-0.5 * self.config.initializer_factor, ) + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) normal_( module.visual_projection.weight, - std=module.vision_embed_dim - **-0.5 * self.config.initializer_factor, ) + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) elif isinstance(module, HFCLIPVisionModelWithProjection): normal_( module.visual_projection.weight, - std=self.config.hidden_size - **-0.5 * self.config.initializer_factor, ) + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) elif isinstance(module, HFCLIPTextModelWithProjection): normal_( module.text_projection.weight, - std=self.config.hidden_size - **-0.5 * self.config.initializer_factor, ) + std=self.config.hidden_size**-0.5 * self.config.initializer_factor, + ) if isinstance(module, nn.LayerNorm): module.bias.zero_() @@ -599,9 +591,7 @@ def gradient_checkpointing_enable(self): activations". """ if not self.supports_gradient_checkpointing: - raise ValueError( - f"{self.__class__.__name__} does not support gradient checkpointing." 
- ) + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") self.apply(partial(self._set_gradient_checkpointing, value=True)) def gradient_checkpointing_disable(self): @@ -627,8 +617,7 @@ def register_load_torch_hook(self, function=None): def map_from(module, state_dict, *args, **kwargs): if state_dict.pop("is_torch_weight", False): need_transposed = [] - for name, layer in module.named_sublayers( - include_self=True): + for name, layer in module.named_sublayers(include_self=True): if isinstance(layer, nn.Linear): need_transposed.append(name + ".weight") module.need_transposed = need_transposed @@ -637,8 +626,7 @@ def map_from(module, state_dict, *args, **kwargs): else: map_from = function - self.load_torch_hook = self.register_load_state_dict_pre_hook( - map_from, with_module=True) + self.load_torch_hook = self.register_load_state_dict_pre_hook(map_from, with_module=True) return self.load_torch_hook def remove_load_torch_hook(self): @@ -651,7 +639,8 @@ def to(self=None, device=None, dtype=None, blocking=None): dtype=dtype, blocking=blocking, include_sublayers=True, - floating_only=True, ) + floating_only=True, + ) class HFCLIPEncoder(nn.Layer): @@ -666,20 +655,18 @@ class HFCLIPEncoder(nn.Layer): def __init__(self, config: CLIPConfig): super().__init__() self.config = config - self.layers = nn.LayerList([ - HFCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers) - ]) + self.layers = nn.LayerList([HFCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( - self, - inputs_embeds, - attention_mask: Optional[paddle.Tensor]=None, - causal_attention_mask: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - BaseModelOutput]: + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + causal_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: r""" Args: inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -709,13 +696,11 @@ def forward( return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
""" - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -723,7 +708,7 @@ def forward( hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): @@ -736,30 +721,31 @@ def custom_forward(*inputs): create_custom_forward(encoder_layer), hidden_states, attention_mask, - causal_attention_mask, ) + causal_attention_mask, + ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, - output_attentions=output_attentions, ) + output_attentions=output_attentions, + ) hidden_states = layer_outputs[0] if output_attentions: - all_attentions = all_attentions + (layer_outputs[1], ) + all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: - encoder_states = encoder_states + (hidden_states, ) + encoder_states = encoder_states + (hidden_states,) if not return_dict: - return tuple( - v for v in [hidden_states, encoder_states, all_attentions] - if v is not None) + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, - attentions=all_attentions, ) + attentions=all_attentions, + ) # def _make_causal_mask( @@ -786,31 +772,28 @@ def __init__(self, config: CLIPTextConfig): embed_dim = config.hidden_size self.embeddings = HFCLIPTextEmbeddings(config) self.encoder = HFCLIPEncoder(config) - self.final_layer_norm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.final_layer_norm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) # For `pooled_output` computation self.eos_token_id = config.eos_token_id def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - 
self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -818,8 +801,7 @@ def forward( input_shape = input_ids.shape input_ids = input_ids.reshape([-1, input_shape[-1]]) - hidden_states = self.embeddings( - input_ids=input_ids, position_ids=position_ids) + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) bsz, seq_len = input_shape # CLIP's text model uses causal mask, prepare it here. @@ -828,7 +810,8 @@ def forward( causal_attention_mask = self._build_causal_attention_mask( bsz, seq_len, - hidden_states.dtype, ) + hidden_states.dtype, + ) # expand attention_mask if attention_mask is not None: # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] @@ -840,7 +823,8 @@ def forward( causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.final_layer_norm(last_hidden_state) @@ -855,24 +839,24 @@ def forward( pooled_output = last_hidden_state.gather_nd( paddle.stack( [ - paddle.arange( - last_hidden_state.shape[0], dtype="int32"), - input_ids.argmax( - -1, dtype="int32"), + paddle.arange(last_hidden_state.shape[0], dtype="int32"), + input_ids.argmax(-1, dtype="int32"), ], - axis=-1, )) + axis=-1, + ) + ) else: # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible) # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) pooled_output = last_hidden_state.gather_nd( paddle.stack( [ - paddle.arange( - last_hidden_state.shape[0], dtype="int32"), - (input_ids == self.eos_token_id).cast("int32").argmax( - axis=-1, dtype="int32"), + paddle.arange(last_hidden_state.shape[0], dtype="int32"), + (input_ids == self.eos_token_id).cast("int32").argmax(axis=-1, dtype="int32"), ], - axis=-1, )) + axis=-1, + ) + ) if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -881,12 +865,14 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) def _build_causal_attention_mask(self, bsz, seq_len, dtype): mask = paddle.triu( paddle.full((bsz, 1, seq_len, seq_len), finfo(dtype).min), - diagonal=1, ) + diagonal=1, + ) return mask @@ -908,14 +894,14 @@ def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> 
Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: @@ -933,8 +919,7 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled (EOS token) states ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict return self.text_model( input_ids=input_ids, @@ -942,7 +927,8 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) class HFCLIPVisionTransformer(nn.Layer): @@ -952,30 +938,26 @@ def __init__(self, config: CLIPVisionConfig): embed_dim = config.hidden_size self.embeddings = HFCLIPVisionEmbeddings(config) - self.pre_layrnorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.pre_layrnorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) self.encoder = HFCLIPEncoder(config) - self.post_layernorm = nn.LayerNorm( - embed_dim, epsilon=config.layer_norm_eps) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ - output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -987,7 +969,8 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] @@ -1000,7 +983,8 @@ def forward( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, ) + attentions=encoder_outputs.attentions, + ) class HFCLIPVisionModel(HFCLIPPretrainedModel): @@ -1017,12 +1001,12 @@ def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, BaseModelOutputWithPooling]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, 
BaseModelOutputWithPooling]: r""" Returns: @@ -1045,14 +1029,14 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) class HFCLIPModel(HFCLIPPretrainedModel): @@ -1064,12 +1048,14 @@ def __init__(self, config: CLIPConfig): if not isinstance(config.text_config, CLIPTextConfig): raise ValueError( "config.text_config is expected to be of type CLIPTextConfig but is of type" - f" {type(config.text_config)}.") + f" {type(config.text_config)}." + ) if not isinstance(config.vision_config, CLIPVisionConfig): raise ValueError( "config.vision_config is expected to be of type CLIPVisionConfig but is of type" - f" {type(config.vision_config)}.") + f" {type(config.vision_config)}." + ) text_config = config.text_config vision_config = config.vision_config @@ -1081,24 +1067,22 @@ def __init__(self, config: CLIPConfig): self.text_model = HFCLIPTextTransformer(text_config) self.vision_model = HFCLIPVisionTransformer(vision_config) - self.visual_projection = LinearClass( - self.vision_embed_dim, self.projection_dim, bias_attr=False) - self.text_projection = LinearClass( - self.text_embed_dim, self.projection_dim, bias_attr=False) - self.logit_scale = Parameter( - paddle.to_tensor(self.config.logit_scale_init_value)) + self.visual_projection = LinearClass(self.vision_embed_dim, self.projection_dim, bias_attr=False) + self.text_projection = LinearClass(self.text_embed_dim, self.projection_dim, bias_attr=False) + self.logit_scale = Parameter(paddle.to_tensor(self.config.logit_scale_init_value)) # Initialize weights and apply final processing self.post_init() def get_text_features( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> paddle.Tensor: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: r""" Returns: text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by @@ -1116,13 +1100,11 @@ def get_text_features( >>> text_features = model.get_text_features(**inputs) ```""" # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
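# (i.e. each of `output_attentions`, `output_hidden_states` and `return_dict` falls back to the
# corresponding attribute on `self.config` whenever the caller passes `None`)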
- output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.text_model( input_ids=input_ids, @@ -1130,7 +1112,8 @@ def get_text_features( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = text_outputs[1] text_features = self.text_projection(pooled_output) @@ -1138,11 +1121,12 @@ def get_text_features( return text_features def get_image_features( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> paddle.Tensor: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: r""" Returns: image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by @@ -1166,19 +1150,18 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = vision_outputs[1] # pooled_output image_features = self.visual_projection(pooled_output) @@ -1186,15 +1169,16 @@ def get_image_features( return image_features def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - pixel_values: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - return_loss: Optional[bool]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, HFCLIPOutput]: + self, + input_ids: Optional[paddle.Tensor] = None, + pixel_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, HFCLIPOutput]: r""" Returns: @@ -1221,19 +1205,18 @@ def forward( >>> probs = F.softmax(logits_per_image.softmax, axis=1) # we can take the softmax to get the label probabilities ```""" # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = (output_attentions if output_attentions is not None - else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else - self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) text_outputs = self.text_model( input_ids=input_ids, @@ -1241,7 +1224,8 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) image_embeds = vision_outputs[1] image_embeds = self.visual_projection(image_embeds) @@ -1250,14 +1234,12 @@ def forward( text_embeds = self.text_projection(text_embeds) # normalized features - image_embeds = image_embeds / image_embeds.norm( - p=2, axis=-1, keepdim=True) + image_embeds = image_embeds / image_embeds.norm(p=2, axis=-1, keepdim=True) text_embeds = text_embeds / text_embeds.norm(p=2, axis=-1, keepdim=True) # cosine similarity as logits logit_scale = self.logit_scale.exp() - logits_per_text = paddle.matmul(text_embeds, - image_embeds.t()) * logit_scale + logits_per_text = paddle.matmul(text_embeds, image_embeds.t()) * logit_scale logits_per_image = logits_per_text.t() loss = None @@ -1271,8 +1253,9 @@ def forward( text_embeds, image_embeds, text_outputs, - vision_outputs, ) - return ((loss, ) + output) if loss is not None else output + vision_outputs, + ) + return ((loss,) + output) if loss is not None else output return HFCLIPOutput( loss=loss, @@ -1281,7 +1264,8 @@ def forward( text_embeds=text_embeds, image_embeds=image_embeds, text_model_output=text_outputs, - vision_model_output=vision_outputs, ) + vision_model_output=vision_outputs, + ) class HFCLIPTextModelWithProjection(HFCLIPPretrainedModel): @@ -1294,8 +1278,7 @@ def __init__(self, config: CLIPTextConfig): self.text_model = HFCLIPTextTransformer(config) - self.text_projection = LinearClass( - config.hidden_size, config.projection_dim, bias_attr=False) + self.text_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False) # Initialize weights and apply final processing self.post_init() @@ -1307,14 +1290,14 @@ def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value def forward( - self, - input_ids: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - position_ids: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[Tuple, - HFCLIPTextModelOutput]: + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, HFCLIPTextModelOutput]: r""" Returns: @@ -1331,8 +1314,7 @@ def forward( >>> outputs = 
model(**inputs) >>> text_embeds = outputs.text_embeds ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict text_outputs = self.text_model( input_ids=input_ids, @@ -1340,7 +1322,8 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = text_outputs[1] @@ -1354,7 +1337,8 @@ def forward( text_embeds=text_embeds, last_hidden_state=text_outputs.last_hidden_state, hidden_states=text_outputs.hidden_states, - attentions=text_outputs.attentions, ) + attentions=text_outputs.attentions, + ) class HFCLIPVisionModelWithProjection(HFCLIPPretrainedModel): @@ -1366,8 +1350,7 @@ def __init__(self, config: CLIPVisionConfig): self.vision_model = HFCLIPVisionTransformer(config) - self.visual_projection = LinearClass( - config.hidden_size, config.projection_dim, bias_attr=False) + self.visual_projection = LinearClass(config.hidden_size, config.projection_dim, bias_attr=False) # Initialize weights and apply final processing self.post_init() @@ -1376,12 +1359,12 @@ def get_input_embeddings(self) -> nn.Layer: return self.vision_model.embeddings.patch_embedding def forward( - self, - pixel_values: Optional[paddle.Tensor]=None, - output_attentions: Optional[bool]=None, - output_hidden_states: Optional[bool]=None, - return_dict: Optional[bool]=None, ) -> Union[ - Tuple, HFCLIPVisionModelOutput]: + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, HFCLIPVisionModelOutput]: r""" Returns: @@ -1403,14 +1386,14 @@ def forward( >>> outputs = model(**inputs) >>> image_embeds = outputs.image_embeds ```""" - return_dict = (return_dict if return_dict is not None else - self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) + return_dict=return_dict, + ) pooled_output = vision_outputs[1] # pooled_output @@ -1424,4 +1407,5 @@ def forward( image_embeds=image_embeds, last_hidden_state=vision_outputs.last_hidden_state, hidden_states=vision_outputs.hidden_states, - attentions=vision_outputs.attentions, ) + attentions=vision_outputs.attentions, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index c74cfb57a53b3..80cf9f98c1082 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -20,8 +20,7 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...loaders import TextualInversionLoaderMixin @@ -46,11 +45,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, 
:] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -61,50 +56,46 @@ def preprocess(image): return image -def posterior_sample(scheduler, latents, timestep, clean_latents, generator, - eta): +def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps if prev_timestep <= 0: return clean_latents # 2. compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # direction pointing to x_t - e_t = (latents - alpha_prod_t** - (0.5) * clean_latents) / (1 - alpha_prod_t)**(0.5) - dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2)**(0.5) * e_t - noise = std_dev_t * randn_tensor( - clean_latents.shape, dtype=clean_latents.dtype, generator=generator) - prev_latents = alpha_prod_t_prev**(0.5) * clean_latents + dir_xt + noise + e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) + dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t + noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator) + prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise return prev_latents def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps # 2. compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t** - (0.5) * noise_pred) / alpha_prod_t**(0.5) + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) # 4. Clip "predicted x_0" if scheduler.config.clip_sample: @@ -113,16 +104,14 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 5. compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # 6. 
compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**( - 0.5) * noise_pred + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred - noise = (prev_latents - - (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction)) / ( - variance**(0.5) * eta) + noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( + variance ** (0.5) * eta + ) return noise @@ -156,31 +145,28 @@ class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) @@ -200,12 +186,10 @@ def __init__( f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -216,12 +200,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -233,18 +214,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -284,29 +267,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -314,8 +299,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -325,21 +309,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): 
raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -347,71 +332,67 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -424,7 +405,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): @@ -433,15 +415,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -449,11 +429,10 @@ def prepare_extra_step_kwargs(self, generator, eta): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -470,21 +449,14 @@ def decode_latents(self, latents): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): image = image.cast(dtype) batch_size = image.shape[0] @@ -496,8 +468,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -505,8 +476,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -518,20 +488,19 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - 
standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt * - num_images_per_prompt, - axis=0, ) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + [init_latents] * additional_image_per_prompt * num_images_per_prompt, + axis=0, + ) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." ) else: - init_latents = paddle.concat( - [init_latents] * num_images_per_prompt, axis=0) + init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0) # add noise to latents using the timestep shape = init_latents.shape @@ -546,25 +515,25 @@ def prepare_latents(self, @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -639,7 +608,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -661,20 +631,19 @@ def __call__( do_classifier_free_guidance, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) source_prompt_embeds = self._encode_prompt( - source_prompt, num_images_per_prompt, do_classifier_free_guidance, - None) + source_prompt, num_images_per_prompt, do_classifier_free_guidance, None + ) # 4. Preprocess image image = preprocess(image) # 5. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables latents, clean_latents = self.prepare_latents( @@ -683,7 +652,8 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) source_latents = latents # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -691,17 +661,14 @@ def __call__( generator = extra_step_kwargs.pop("generator", None) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = paddle.concat([latents] * 2) source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input( - source_latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) # predict the noise residual concat_latent_model_input = paddle.stack( @@ -711,7 +678,8 @@ def __call__( source_latent_model_input[1], latent_model_input[1], ], - axis=0, ) + axis=0, + ) concat_prompt_embeds = paddle.stack( [ source_prompt_embeds[0], @@ -719,23 +687,25 @@ def __call__( source_prompt_embeds[1], prompt_embeds[1], ], - axis=0, ) + axis=0, + ) concat_noise_pred = self.unet( concat_latent_model_input, t, - encoder_hidden_states=concat_prompt_embeds, ).sample + encoder_hidden_states=concat_prompt_embeds, + ).sample # perform guidance ( source_noise_pred_uncond, noise_pred_uncond, source_noise_pred_text, - noise_pred_text, ) = concat_noise_pred.chunk( - 4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text, + ) = concat_noise_pred.chunk(4, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond) + source_noise_pred_text - source_noise_pred_uncond + ) # Sample source_latents from the posterior distribution. prev_source_latents = posterior_sample( @@ -744,7 +714,8 @@ def __call__( t, clean_latents, generator=generator, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) # Compute noise. 
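        # A sketch of what `compute_noise` recovers, using the DDIM notation of formula (12) in
        # https://arxiv.org/pdf/2010.02502.pdf: the update is
        #     x_t−1 = sqrt(α_t−1) * x̂_0 + sqrt(1 − α_t−1 − σ_t²) * ε_θ(x_t) + σ_t * z
        # so, given x_t (`source_latents`), x_t−1 (`prev_source_latents`) and ε_θ (`source_noise_pred`),
        # it solves for the noise term
        #     z = (x_t−1 − sqrt(α_t−1) * x̂_0 − sqrt(1 − α_t−1 − σ_t²) * ε_θ(x_t)) / σ_t
        # with σ_t = eta * sqrt(variance). This recovered z is then reused as `variance_noise` in the
        # `scheduler.step` call below, so the edited latents follow the same stochastic path as the
        # source latents.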
noise = compute_noise( self.scheduler, @@ -752,21 +723,17 @@ def __call__( source_latents, t, source_noise_pred, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) source_latents = prev_source_latents # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( - noise_pred, - t, - latents, - variance_noise=noise, - **extra_step_kwargs).prev_sample + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -775,8 +742,7 @@ def __call__( image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 11. Convert to PIL if output_type == "pil": @@ -785,5 +751,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py index 33a4cd8838fe2..31fc2eb7d9db6 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_cycle_diffusion.py @@ -22,57 +22,52 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import DDIMScheduler from ...utils import logging, randn_tensor -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) -def posterior_sample(scheduler, latents, timestep, clean_latents, generator, - eta): +def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps if prev_timestep <= 0: return clean_latents # 2. 
compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # direction pointing to x_t - e_t = (latents - alpha_prod_t** - (0.5) * clean_latents) / (1 - alpha_prod_t)**(0.5) - dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2)**(0.5) * e_t - noise = std_dev_t * randn_tensor( - clean_latents.shape, dtype=clean_latents.dtype, generator=generator) - prev_latents = alpha_prod_t_prev**(0.5) * clean_latents + dir_xt + noise + e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) + dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t + noise = std_dev_t * randn_tensor(clean_latents.shape, dtype=clean_latents.dtype, generator=generator) + prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise return prev_latents def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 1. get previous step value (=t-1) - prev_timestep = (timestep - scheduler.config.num_train_timesteps // - scheduler.num_inference_steps) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps # 2. compute alphas, betas alpha_prod_t = scheduler.alphas_cumprod[timestep] - alpha_prod_t_prev = (scheduler.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else scheduler.final_alpha_cumprod) + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = (latents - beta_prod_t** - (0.5) * noise_pred) / alpha_prod_t**(0.5) + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) # 4. Clip "predicted x_0" if scheduler.config.clip_sample: @@ -81,21 +76,18 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): # 5. compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) variance = scheduler._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**( - 0.5) * noise_pred + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred - noise = (prev_latents - - (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction)) / ( - variance**(0.5) * eta) + noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( + variance ** (0.5) * eta + ) return noise -class FastDeployCycleDiffusionPipeline(DiffusionPipeline, - FastDeployDiffusionPipelineMixin): +class FastDeployCycleDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. 
@@ -125,16 +117,17 @@ class FastDeployCycleDiffusionPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: DDIMScheduler, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: DDIMScheduler, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -159,37 +152,38 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() self.change_scheduler("ddim") def __call__( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -266,8 +260,7 @@ def __call__( (nsfw) content, according to the `safety_checker`. """ # 0. 
Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs @@ -279,7 +272,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -305,23 +299,23 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) source_prompt_embeds = self._encode_prompt( source_prompt, num_images_per_prompt, do_classifier_free_guidance, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 6. Prepare latent variables # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) is_strength_max = strength == 1.0 latents, clean_latents = self.prepare_latents( batch_size * num_images_per_prompt, @@ -333,7 +327,8 @@ def __call__( timestep=latent_timestep, is_strength_max=is_strength_max, return_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) source_latents = latents # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -341,18 +336,15 @@ def __call__( generator = extra_step_kwargs.pop("generator", None) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance latent_model_input = paddle.concat([latents] * 2) source_latent_model_input = paddle.concat([source_latents] * 2) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - source_latent_model_input = self.scheduler.scale_model_input( - source_latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) # predict the noise residual concat_latent_model_input = paddle.stack( @@ -362,7 +354,8 @@ def __call__( source_latent_model_input[1], latent_model_input[1], ], - axis=0, ) + axis=0, + ) concat_prompt_embeds = paddle.stack( [ source_prompt_embeds[0], @@ -370,14 +363,16 @@ def __call__( source_prompt_embeds[1], prompt_embeds[1], ], - axis=0, ) + axis=0, + ) unet_inputs = dict( sample=concat_latent_model_input, timestep=t, encoder_hidden_states=concat_prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=concat_latent_model_input.shape, ) + output_shape=concat_latent_model_input.shape, + ) # predict the noise residual concat_noise_pred = self.unet(**unet_inputs)[0] @@ -386,12 +381,12 @@ def __call__( source_noise_pred_uncond, noise_pred_uncond, source_noise_pred_text, - noise_pred_text, ) = concat_noise_pred.chunk( - 4, axis=0) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_text, + ) = concat_noise_pred.chunk(4, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( - source_noise_pred_text - source_noise_pred_uncond) + source_noise_pred_text - source_noise_pred_uncond + ) # Sample source_latents from the posterior distribution. prev_source_latents = posterior_sample( @@ -400,7 +395,8 @@ def __call__( t, clean_latents, generator=generator, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) # Compute noise. 
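                # As in `pipeline_cycle_diffusion.py` above, `compute_noise` inverts the DDIM update to
                # recover the per-step noise that reproduces `prev_source_latents`; that noise is reused
                # as `variance_noise` in the `scheduler.step` call below.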
noise = compute_noise( self.scheduler, @@ -408,20 +404,16 @@ def __call__( source_latents, t, source_noise_pred, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) source_latents = prev_source_latents # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step( - noise_pred, - t, - latents, - variance_noise=noise, - **extra_step_kwargs).prev_sample + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -432,7 +424,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -443,11 +436,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py index 61110d7638d0f..8de1b7b464dfb 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion.py @@ -22,15 +22,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionPipeline(DiffusionPipeline, - FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -60,21 +58,20 @@ class FastDeployStableDiffusionPipeline(DiffusionPipeline, feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" - _optional_components = [ - "vae_encoder", "safety_checker", "feature_extractor" - ] + _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -99,34 +96,35 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -200,7 +198,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. 
Define call parameters @@ -226,7 +225,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -238,7 +238,8 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -250,47 +251,42 @@ def __call__( height, width, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet # compute the previous noisy sample x_t -> x_t-1 @@ -301,15 +297,13 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is 
not None and i % callback_steps == 0: callback(i, t, latents) @@ -320,7 +314,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -331,11 +326,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py index 6d1b14edfaa32..324d66f3e0187 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_controlnet.py @@ -13,16 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .pipeline_fastdeploy_stable_diffusion import \ - FastDeployStableDiffusionPipeline +from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline -class FastDeployStableDiffusionControlNetPipeline( - FastDeployStableDiffusionPipeline): +class FastDeployStableDiffusionControlNetPipeline(FastDeployStableDiffusionPipeline): def __call__( - self, - *args, - **kwargs, ): + self, + *args, + **kwargs, + ): controlnet_cond = kwargs.pop("controlnet_cond", None) image = kwargs.pop("image", None) if controlnet_cond is None: diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py index 7f92020a9d9dc..b90541cfd23a1 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_image_variation.py @@ -20,7 +20,9 @@ from paddlenlp.transformers import CLIPImageProcessor from ppdiffusers.pipelines.fastdeploy_utils import ( - FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel) + FastDeployDiffusionPipelineMixin, + FastDeployRuntimeModel, +) from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging @@ -30,8 +32,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionImageVariationPipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionImageVariationPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline to generate variations from an input image using Stable Diffusion. 
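A rough usage sketch for the image-variation pipeline follows. It is an assumption-laden example rather than code from this change: the import path and model directory are placeholders, and only parameters from the __call__ signature shown later in this file are used.

# Hypothetical example: generate variations of an input image.
from PIL import Image

from ppdiffusers import FastDeployStableDiffusionImageVariationPipeline  # assumed export path

pipe = FastDeployStableDiffusionImageVariationPipeline.from_pretrained(
    "./sd-image-variations-fastdeploy"  # placeholder model directory
)
init_image = Image.open("input.png").convert("RGB")
output = pipe(image=init_image, num_inference_steps=50, guidance_scale=7.5)
output.images[0].save("variation.png")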
@@ -59,15 +60,16 @@ class FastDeployStableDiffusionImageVariationPipeline( _optional_components = ["safety_checker"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - image_encoder: FastDeployRuntimeModel, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + image_encoder: FastDeployRuntimeModel, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -93,28 +95,27 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() - def _encode_image(self, image, num_images_per_prompt, - do_classifier_free_guidance, infer_op_dict): + def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict): if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image_encoder_inputs = dict( pixel_values=image, infer_op=infer_op_dict.get("image_encoder", None), - output_shape=[image.shape[0], 768], ) + output_shape=[image.shape[0], 768], + ) image_embeddings = self.image_encoder(**image_encoder_inputs)[0] image_embeddings = image_embeddings.unsqueeze(1) # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: negative_prompt_embeds = paddle.zeros_like(image_embeddings) @@ -122,49 +123,50 @@ def _encode_image(self, image, num_images_per_prompt, # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) @paddle.no_grad() def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -242,9 +244,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input image - image_embeddings = self._encode_image(image, num_images_per_prompt, - do_classifier_free_guidance, - infer_op_dict) + image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance, infer_op_dict) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -256,26 +256,23 @@ def __call__( height, width, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual unet_inputs = dict( @@ -283,14 +280,14 @@ def __call__( timestep=t, encoder_hidden_states=image_embeddings, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) noise_pred = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 if is_scheduler_support_step_index: @@ -300,16 +297,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -320,7 +315,8 @@ def __call__( # 8. Post-processing image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) # 9. 
Run safety checker image, has_nsfw_concept = self.run_safety_checker(image) @@ -330,11 +326,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py index c282d47747dec..49f736a9c71c5 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_img2img.py @@ -22,15 +22,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionImg2ImgPipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionImg2ImgPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image-to-image generation using Stable Diffusion. 
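As a point of reference, a text-guided image-to-image call might look like the sketch below; the model directory and file names are placeholders and the arguments mirror the defaults in this pipeline's __call__ signature, so treat it as an illustrative assumption rather than documented usage.

# Hypothetical img2img sketch; "./stable-diffusion-v1-5-fastdeploy" and
# "sketch.png" are placeholders.
from PIL import Image

from ppdiffusers import FastDeployStableDiffusionImg2ImgPipeline  # assumed export path

pipe = FastDeployStableDiffusionImg2ImgPipeline.from_pretrained("./stable-diffusion-v1-5-fastdeploy")
init_image = Image.open("sketch.png").convert("RGB")
image = pipe(
    prompt="a fantasy landscape, highly detailed",
    image=init_image,
    strength=0.8,            # how much of the denoising schedule to run
    num_inference_steps=50,
    guidance_scale=7.5,
).images[0]
image.save("fantasy_landscape.png")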
@@ -63,16 +61,17 @@ class FastDeployStableDiffusionImg2ImgPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -97,36 +96,37 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -199,8 +199,7 @@ def __call__( (nsfw) content, according to the `safety_checker`. """ # 0. 
Preprocess image - init_image = self.image_processor.preprocess( - image, height=height, width=width) + init_image = self.image_processor.preprocess(image, height=height, width=width) height, width = init_image.shape[-2:] # 1. Check inputs. Raise error if not correct @@ -212,7 +211,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -238,7 +238,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -250,17 +251,16 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # 5. Prepare latent variables # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 latents = self.prepare_latents( @@ -272,47 +272,42 @@ def __call__( image=init_image, timestep=latent_timestep, is_strength_max=is_strength_max, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -324,16 +319,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -344,7 +337,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -355,11 +349,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git 
a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py index 4fdbacaaf890a..2ae694a4f8e2f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint.py @@ -23,18 +23,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def prepare_mask_and_masked_image(image, - mask, - height=None, - width=None, - return_image: bool=False): +def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): """ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the @@ -71,14 +66,11 @@ def prepare_mask_and_masked_image(image, if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -95,12 +87,9 @@ def prepare_mask_and_masked_image(image, else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -117,8 +106,7 @@ def prepare_mask_and_masked_image(image, # Image as float32 image = image.cast(dtype=paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -129,13 +117,8 @@ def prepare_mask_and_masked_image(image, w, h = image[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - image = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) - for i in image - ] + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] image = 
[np.array(i.convert("RGB"))[None, :] for i in image] image = np.concatenate(image, axis=0) elif isinstance(image, list) and isinstance(image[0], np.ndarray): @@ -154,14 +137,9 @@ def prepare_mask_and_masked_image(image, w, h = mask[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - mask = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in mask - ] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + mask = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -179,8 +157,7 @@ def prepare_mask_and_masked_image(image, return mask, masked_image -class FastDeployStableDiffusionInpaintPipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionInpaintPipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image inpainting using Stable Diffusion. @@ -213,16 +190,17 @@ class FastDeployStableDiffusionInpaintPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -247,38 +225,39 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - 
infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -363,7 +342,8 @@ def __call__( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -375,7 +355,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters @@ -401,15 +382,14 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -429,7 +409,8 @@ def __call__( is_strength_max=is_strength_max, return_noise=True, return_image_latents=return_image_latents, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) if return_image_latents: latents, noise, image_latents = latents_outputs @@ -445,24 +426,23 @@ def __call__( width, do_classifier_free_guidance, return_masked_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 7. Check that sizes of mask, masked image and latents match if num_channels_unet == 9: # default case for runwayml/stable-diffusion-inpainting num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + - num_channels_masked_image != num_channels_unet): + if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: raise ValueError( f"Incorrect configuration settings! 
Received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) elif num_channels_unet != 4: - raise ValueError( - f"The unet should have either 4 or 9 input channels, not {num_channels_unet}." - ) + raise ValueError(f"The unet should have either 4 or 9 input channels, not {num_channels_unet}.") # do_controlnet do_controlnet = controlnet_cond is not None and num_channels_unet == 4 if do_controlnet: @@ -473,59 +453,52 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) output_shape = latent_model_input.shape if not is_legacy: # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = paddle.concat( - [latent_model_input, mask, masked_image_latents], - axis=1) + latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=output_shape, ) + output_shape=output_shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -537,32 +510,27 @@ def __call__( latents, step_index=i, 
return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample if is_legacy: if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -573,7 +541,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -584,11 +553,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py index 59c3a5bd12dec..7d2c1d82e5651 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_inpaint_legacy.py @@ -23,18 +23,13 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from . 
import StableDiffusionPipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -def prepare_mask_and_masked_image(image, - mask, - height=None, - width=None, - return_image: bool=False): +def prepare_mask_and_masked_image(image, mask, height=None, width=None, return_image: bool = False): """ Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be converted to ``paddle.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the @@ -71,14 +66,11 @@ def prepare_mask_and_masked_image(image, if isinstance(image, paddle.Tensor): if not isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -95,12 +87,9 @@ def prepare_mask_and_masked_image(image, else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -117,8 +106,7 @@ def prepare_mask_and_masked_image(image, # Image as float32 image = image.cast(dtype=paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -129,13 +117,8 @@ def prepare_mask_and_masked_image(image, w, h = image[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - image = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]) - for i in image - ] + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = [i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) for i in image] image = [np.array(i.convert("RGB"))[None, :] for i in image] image = np.concatenate(image, axis=0) elif isinstance(image, list) and isinstance(image[0], np.ndarray): @@ -154,14 +137,9 @@ def prepare_mask_and_masked_image(image, w, h = mask[0].size else: w, h = width, height - w, h = (x - x % 8 - for x in (w, h)) # resize to integer multiple of 8 - mask = [ - i.resize( - (w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask - ] - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + mask = [i.resize((w, h), resample=PIL_INTERPOLATION["nearest"]) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif 
isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -179,8 +157,7 @@ def prepare_mask_and_masked_image(image, return mask, masked_image -class FastDeployStableDiffusionInpaintPipelineLegacy( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionInpaintPipelineLegacy(DiffusionPipeline, FastDeployDiffusionPipelineMixin): r""" Pipeline for text-guided image inpainting legacy using Stable Diffusion. @@ -213,16 +190,17 @@ class FastDeployStableDiffusionInpaintPipelineLegacy( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae_encoder: FastDeployRuntimeModel, - vae_decoder: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: CLIPTokenizer, - unet: FastDeployRuntimeModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: FastDeployRuntimeModel, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=False, ): + self, + vae_encoder: FastDeployRuntimeModel, + vae_decoder: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: CLIPTokenizer, + unet: FastDeployRuntimeModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: FastDeployRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -247,38 +225,39 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.post_init() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: int=None, - width: int=None, - strength: float=1.0, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: int = None, + width: int = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[paddle.Tensor] = None, + 
negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -363,7 +342,8 @@ def __call__( mask_image, height, width, - return_image=True, ) + return_image=True, + ) height, width = init_image.shape[-2:] # 1. Check inputs @@ -375,7 +355,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - strength, ) + strength, + ) infer_op_dict = self.prepare_infer_op_dict(infer_op_dict) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -400,7 +381,8 @@ def __call__( height=height, batch_size=batch_size, num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -412,15 +394,14 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise is_strength_max = strength == 1.0 @@ -436,7 +417,8 @@ def __call__( is_strength_max=is_strength_max, return_noise=True, return_image_latents=True, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 6. Prepare mask latent variables mask = self.prepare_mask_latents( @@ -447,52 +429,47 @@ def __call__( width, do_classifier_free_guidance, return_masked_image_latents=False, - infer_op=infer_op_dict.get("vae_encoder", None), ) + infer_op=infer_op_dict.get("vae_encoder", None), + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) if do_classifier_free_guidance: - init_mask = mask[:mask.shape[0] // 2] + init_mask = mask[: mask.shape[0] // 2] else: init_mask = mask # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=latent_model_input, timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) if do_controlnet: unet_inputs["controlnet_cond"] = control_image - unet_inputs[ - "controlnet_conditioning_scale"] = control_conditioning_scale + unet_inputs["controlnet_conditioning_scale"] = control_conditioning_scale # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -504,32 +481,27 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - image_latents, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(image_latents, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - image_latents, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(image_latents, noise, noise_timestep) else: init_latents_proper = image_latents - latents = (1 - init_mask - ) * init_latents_proper + init_mask * latents + latents = (1 - init_mask) * init_latents_proper + init_mask * latents # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -540,7 +512,8 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - 
infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) image, has_nsfw_concept = self.run_safety_checker(image) else: image = latents @@ -551,11 +524,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py index 7f66d4caec169..d2c9622fd7c8a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_mega.py @@ -19,16 +19,17 @@ import PIL.Image from ...utils import logging -from .pipeline_fastdeploy_cycle_diffusion import \ - FastDeployCycleDiffusionPipeline -from .pipeline_fastdeploy_stable_diffusion import \ - FastDeployStableDiffusionPipeline -from .pipeline_fastdeploy_stable_diffusion_img2img import \ - FastDeployStableDiffusionImg2ImgPipeline -from .pipeline_fastdeploy_stable_diffusion_inpaint import \ - FastDeployStableDiffusionInpaintPipeline -from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import \ - FastDeployStableDiffusionInpaintPipelineLegacy +from .pipeline_fastdeploy_cycle_diffusion import FastDeployCycleDiffusionPipeline +from .pipeline_fastdeploy_stable_diffusion import FastDeployStableDiffusionPipeline +from .pipeline_fastdeploy_stable_diffusion_img2img import ( + FastDeployStableDiffusionImg2ImgPipeline, +) +from .pipeline_fastdeploy_stable_diffusion_inpaint import ( + FastDeployStableDiffusionInpaintPipeline, +) +from .pipeline_fastdeploy_stable_diffusion_inpaint_legacy import ( + FastDeployStableDiffusionInpaintPipelineLegacy, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -63,45 +64,39 @@ class FastDeployStableDiffusionMegaPipeline(FastDeployStableDiffusionPipeline): feature_extractor ([`CLIPFeatureExtractor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
""" - _optional_components = [ - "vae_encoder", "safety_checker", "feature_extractor" - ] + _optional_components = ["vae_encoder", "safety_checker", "feature_extractor"] def __call__(self, *args, **kwargs): return self.text2img(*args, **kwargs) def text2img( - self, - prompt: Union[str, List[str]], - height: Optional[int]=512, - width: Optional[int]=512, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): - expected_components = inspect.signature( - FastDeployStableDiffusionPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(FastDeployStableDiffusionPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -122,42 +117,39 @@ def text2img( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, 
paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): - expected_components = inspect.signature( - FastDeployStableDiffusionImg2ImgPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): + expected_components = inspect.signature(FastDeployStableDiffusionImg2ImgPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionImg2ImgPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -180,48 +172,46 @@ def img2img( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def inpaint_legacy( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: 
Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): assert ( self.unet_num_latent_channels == 4 ), f"Detected `unet_num_latent_channels` is {self.unet_num_latent_channels}, Plese use `inpaint` method." expected_components = inspect.signature( - FastDeployStableDiffusionInpaintPipelineLegacy. - __init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + FastDeployStableDiffusionInpaintPipelineLegacy.__init__ + ).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionInpaintPipelineLegacy( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -245,45 +235,42 @@ def inpaint_legacy( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height=None, - width=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - controlnet_cond: Union[paddle.Tensor, PIL.Image.Image]=None, - controlnet_conditioning_scale: float=1.0, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height=None, + width=None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + controlnet_cond: Union[paddle.Tensor, PIL.Image.Image] = None, + controlnet_conditioning_scale: float = 1.0, + infer_op_dict: Dict[str, str] = None, + ): assert self.unet_num_latent_channels in [4, 9] - expected_components 
= inspect.signature( - FastDeployStableDiffusionInpaintPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(FastDeployStableDiffusionInpaintPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployStableDiffusionInpaintPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -307,46 +294,42 @@ def inpaint( callback_steps=callback_steps, controlnet_cond=controlnet_cond, controlnet_conditioning_scale=controlnet_conditioning_scale, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output def cycle_diffusion( - self, - prompt: Union[str, List[str]], - source_prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[paddle.Tensor]=None, - source_guidance_scale: Optional[float]=1, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.1, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): - expected_components = inspect.signature( - FastDeployCycleDiffusionPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[paddle.Tensor] = None, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): + expected_components = inspect.signature(FastDeployCycleDiffusionPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = FastDeployCycleDiffusionPipeline( - **components, - 
requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) temp_pipeline._progress_bar_config = self._progress_bar_config output = temp_pipeline( prompt=prompt, @@ -371,6 +354,7 @@ def cycle_diffusion( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - infer_op_dict=infer_op_dict, ) + infer_op_dict=infer_op_dict, + ) return output diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py index 05ff6fa970504..db0660a1cbb90 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_fastdeploy_stable_diffusion_upscale.py @@ -21,24 +21,23 @@ from ...pipeline_utils import DiffusionPipeline from ...schedulers import DDPMScheduler from ...utils import logging -from ..fastdeploy_utils import (FastDeployDiffusionPipelineMixin, - FastDeployRuntimeModel) +from ..fastdeploy_utils import FastDeployDiffusionPipelineMixin, FastDeployRuntimeModel from ..pipeline_utils import ImagePipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class FastDeployStableDiffusionUpscalePipeline( - DiffusionPipeline, FastDeployDiffusionPipelineMixin): +class FastDeployStableDiffusionUpscalePipeline(DiffusionPipeline, FastDeployDiffusionPipelineMixin): def __init__( - self, - vae: FastDeployRuntimeModel, - text_encoder: FastDeployRuntimeModel, - tokenizer: Any, - unet: FastDeployRuntimeModel, - low_res_scheduler: DDPMScheduler, - scheduler: Any, - max_noise_level: int=350, ): + self, + vae: FastDeployRuntimeModel, + text_encoder: FastDeployRuntimeModel, + tokenizer: Any, + unet: FastDeployRuntimeModel, + low_res_scheduler: DDPMScheduler, + scheduler: Any, + max_noise_level: int = 350, + ): super().__init__( vae=vae, text_encoder=text_encoder, @@ -49,18 +48,19 @@ def __init__( safety_checker=None, feature_extractor=None, watermarker=None, - max_noise_level=max_noise_level, ) + max_noise_level=max_noise_level, + ) self.post_init(vae_scaling_factor=0.08333) def check_inputs(self, prompt, image, noise_level, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" ) @@ -83,39 +83,38 @@ def check_inputs(self, prompt, image, noise_level, callback_steps): # check noise level if noise_level > self.config.max_noise_level: - raise ValueError( - f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}" - ) + raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise 
ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], - num_inference_steps: int=75, - guidance_scale: float=9.0, - noise_level: int=20, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - parse_prompt_type: Optional[str]="lpw", - max_embeddings_multiples: Optional[int]=3, - prompt_embeds: Optional[np.ndarray]=None, - negative_prompt_embeds: Optional[np.ndarray]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - infer_op_dict: Dict[str, str]=None, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + noise_level: int = 20, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + infer_op_dict: Dict[str, str] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -204,7 +203,8 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, parse_prompt_type=parse_prompt_type, max_embeddings_multiples=max_embeddings_multiples, - infer_op=infer_op_dict.get("text_encoder", None), ) + infer_op=infer_op_dict.get("text_encoder", None), + ) # 4. Preprocess image image = self.image_processor.preprocess(image) @@ -215,13 +215,11 @@ def __call__( # 5. Add noise to image noise_level = paddle.to_tensor([noise_level], dtype="int64") - noise = paddle.randn( - image.shape, generator=generator, dtype=text_embeddings.dtype) + noise = paddle.randn(image.shape, generator=generator, dtype=text_embeddings.dtype) image = self.low_res_scheduler.add_noise(image, noise, noise_level) batch_multiplier = 2 if do_classifier_free_guidance else 1 - image = paddle.concat([image] * batch_multiplier * - num_images_per_prompt) + image = paddle.concat([image] * batch_multiplier * num_images_per_prompt) noise_level = paddle.concat([noise_level] * image.shape[0]) # 6. Prepare latent variables @@ -231,7 +229,8 @@ def __call__( height, width, generator, - latents, ) + latents, + ) NUM_UNET_INPUT_CHANNELS = self.unet_num_latent_channels NUM_LATENT_CHANNELS = self.vae_decoder_num_latent_channels @@ -243,27 +242,24 @@ def __call__( f" {NUM_UNET_INPUT_CHANNELS} but received `num_channels_latents`: {NUM_LATENT_CHANNELS} +" f" `num_channels_image`: {num_channels_image} " f" = {NUM_LATENT_CHANNELS+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 8. Prepare extra step kwargs. 
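Note: the channel check above guards the upscaler's conditioning scheme: the noisy low-resolution image is concatenated with the latents along the channel axis, so the unet's input channels must equal latent channels plus image channels. A small numpy illustration; the channel counts are assumed examples, not read from any real config:

```python
import numpy as np

# Channel bookkeeping for the upscaler unet input described above.
num_latent_channels = 4   # assumed VAE latent channels
num_image_channels = 3    # assumed RGB low-res image channels
unet_in_channels = 7      # what the upscaler unet is assumed to expect

latents = np.zeros((2, num_latent_channels, 32, 32), dtype="float32")
noisy_low_res_image = np.zeros((2, num_image_channels, 32, 32), dtype="float32")

sample = np.concatenate([latents, noisy_low_res_image], axis=1)  # channel-wise concat
assert sample.shape[1] == unet_in_channels, (
    f"unet expects {unet_in_channels} input channels, "
    f"got {num_latent_channels} + {num_image_channels} = {sample.shape[1]}"
)
print(sample.shape)  # (2, 7, 32, 32)
```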
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order is_scheduler_support_step_index = self.is_scheduler_support_step_index() with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents if is_scheduler_support_step_index: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t, step_index=i) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t, step_index=i) else: - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) unet_inputs = dict( sample=paddle.concat( @@ -272,16 +268,15 @@ def __call__( timestep=t, encoder_hidden_states=prompt_embeds, infer_op=infer_op_dict.get("unet", None), - output_shape=latent_model_input.shape, ) + output_shape=latent_model_input.shape, + ) # predict the noise residual noise_pred_unet = self.unet(**unet_inputs)[0] # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk( - 2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred_unet.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) else: noise_pred = noise_pred_unet @@ -293,16 +288,14 @@ def __call__( latents, step_index=i, return_pred_original_sample=False, - **extra_step_kwargs, ) + **extra_step_kwargs, + ) else: - scheduler_output = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs) + scheduler_output = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs) latents = scheduler_output.prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -313,16 +306,18 @@ def __call__( if not output_type == "latent": image = self._decode_vae_latents( latents / self.vae_scaling_factor, - infer_op=infer_op_dict.get("vae_decoder", None), ) + infer_op=infer_op_dict.get("vae_decoder", None), + ) else: image = latents do_denormalize = [True] * image.shape[0] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: - return (image, ) + return (image,) - return ImagePipelineOutput(images=image, ) + return ImagePipelineOutput( + images=image, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 75f8db28f0c67..b847facb71074 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py 
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -18,16 +18,13 @@ import paddle from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import deprecate, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -48,8 +45,7 @@ """ -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -87,37 +83,33 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -125,11 +117,7 @@ def __init__( " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -150,12 +138,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -166,12 +152,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -183,18 +166,20 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -234,29 +219,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -264,8 +251,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -275,21 +261,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -297,46 +284,42 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
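Note: the classifier-free guidance comments here describe the batching trick used in every denoising loop in this diff: as the comment continues just below, the unconditional and text embeddings are stacked into one batch, the latents are duplicated, the unet runs once on the doubled batch, and the two halves are recombined with `guidance_scale`. A minimal numpy stand-in; `fake_unet`, the shapes, and the scale are illustrative only:

```python
import numpy as np

def fake_unet(sample, prompt_embeds):
    # placeholder for the real unet's noise prediction
    return sample * 0.1

guidance_scale = 7.5
latents = np.random.default_rng(0).standard_normal((1, 4, 64, 64)).astype("float32")
prompt_embeds = np.zeros((2, 77, 768), dtype="float32")  # [uncond; text] stacked

latent_model_input = np.concatenate([latents] * 2, axis=0)   # duplicate latents
noise_pred = fake_unet(latent_model_input, prompt_embeds)    # single forward pass
noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2, axis=0)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred.shape)  # (1, 4, 64, 64)
```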
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -355,53 +338,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
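Note: `prepare_extra_step_kwargs` in this file probes the scheduler's `step` signature so that `eta` (the DDIM η) and `generator` are only forwarded to schedulers that accept them. A self-contained sketch of that probing, with a dummy scheduler standing in for the real ones:

```python
import inspect

class DummyScheduler:
    def step(self, noise_pred, t, latents, eta=0.0):  # accepts eta, not generator
        return latents

def build_extra_step_kwargs(scheduler, generator, eta):
    # pass eta/generator only if the scheduler's step() signature declares them
    params = set(inspect.signature(scheduler.step).parameters.keys())
    extra = {}
    if "eta" in params:
        extra["eta"] = eta
    if "generator" in params:
        extra["generator"] = generator
    return extra

print(build_extra_step_kwargs(DummyScheduler(), generator=None, eta=0.0))  # {'eta': 0.0}
```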
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -414,17 +393,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -447,25 +428,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -546,7 +527,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -568,7 +550,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -583,43 +566,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -632,8 +610,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -642,11 +619,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py index 7b6cf35b03da0..0ec8990c31e59 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_adapter.py @@ -18,15 +18,12 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer # from ...loaders import TextualInversionLoaderMixin -from ...models import (AutoencoderKL, MultiAdapter, T2IAdapter, - UNet2DConditionModel) +from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, logging, randn_tensor, - replace_example_docstring) +from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -81,8 +78,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: else: h = int(round(img_size / 8 / coef) * 8) - images = images.resize( - (w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) return images @@ -95,12 +91,8 @@ def preprocess(image): if isinstance(image[0], PIL.Image.Image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"])) for i in image - ] - image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) - for i in image] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])) for i in image] + image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -155,17 +147,18 @@ class StableDiffusionAdapterPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - adapter_weights: Optional[List[float]]=None, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + adapter_weights: 
Optional[List[float]] = None, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: logger.warning( @@ -185,8 +178,9 @@ def __init__( adapter=adapter, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): @@ -206,13 +200,14 @@ def disable_vae_slicing(self): self.vae.disable_slicing() def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): """ Encodes the prompt into text encoder hidden states. @@ -249,32 +244,29 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None - prompt_embeds = self.text_encoder( - text_input_ids, attention_mask=attention_mask) + prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.astype(self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape - prompt_embeds = prompt_embeds.tile( - repeat_times=[1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] if negative_prompt is None: @@ -300,34 +292,28 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + return_tensors="pd", + ) + if hasattr(self.text_encoder.config, 
"use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask) + negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.astype( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - repeat_times=[1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - (batch_size * num_images_per_prompt, seq_len, -1)) - prompt_embeds = paddle.concat( - x=[negative_prompt_embeds, prompt_embeds]) + negative_prompt_embeds = negative_prompt_embeds.astype(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1)) + prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.astype(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.astype(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -336,37 +322,36 @@ def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents image = self.vae.decode(latents).sample image = (image / 2 + 0.5).clip(min=0, max=1) - image = image.cpu().transpose(perm=[0, 2, 3, 1]).astype( - dtype="float32").numpy() + image = image.cpu().transpose(perm=[0, 2, 3, 1]).astype(dtype="float32").numpy() return image def prepare_extra_step_kwargs(self, generator, eta): - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) @@ -378,11 +363,8 @@ def check_inputs( raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." @@ -394,19 +376,21 @@ def check_inputs( ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
@@ -438,28 +422,27 @@ def _default_height_width(self, height, width, image): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, List[ - PIL.Image.Image]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - adapter_conditioning_scale: Union[float, List[float]]=1.0, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + adapter_conditioning_scale: Union[float, List[float]] = 1.0, + ): """ Function invoked when calling the pipeline for generation. 
@@ -550,13 +533,13 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) is_multi_adapter = isinstance(self.adapter, MultiAdapter) if is_multi_adapter: adapter_input = [preprocess(img) for img in image] n, c, h, w = adapter_input[0].shape - adapter_input = paddle.stack( - x=[x.reshape([n * c, h, w]) for x in adapter_input]) + adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input]) else: adapter_input = preprocess(image) adapter_input = adapter_input.astype(self.adapter.dtype) @@ -573,7 +556,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps num_channels_latents = self.unet.in_channels @@ -584,43 +568,35 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) adapter_state = self.adapter(adapter_input) for k, v in enumerate(adapter_state): adapter_state[k] = v * adapter_conditioning_scale if num_images_per_prompt > 1: for k, v in enumerate(adapter_state): - adapter_state[k] = v.tile( - repeat_times=[num_images_per_prompt, 1, 1, 1]) + adapter_state[k] = v.tile(repeat_times=[num_images_per_prompt, 1, 1, 1]) if do_classifier_free_guidance: for k, v in enumerate(adapter_state): adapter_state[k] = paddle.concat(x=[v] * 2, axis=0) - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - latent_model_input = (paddle.concat(x=[latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - down_block_additional_residuals=[ - state.clone() for state in adapter_state - ], ).sample + down_block_additional_residuals=[state.clone() for state in adapter_state], + ).sample if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample - if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -629,14 +605,11 @@ def __call__( has_nsfw_concept = None elif output_type == "pil": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, 
has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) image = self.numpy_to_pil(image) else: image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return image, has_nsfw_concept - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py index 3deff63114cd2..3971ea99471d6 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_all_in_one.py @@ -25,17 +25,20 @@ import PIL import PIL.Image from packaging import version -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...pipeline_utils import DiffusionPipeline from ...schedulers import ( - DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler) + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) from ...utils import PIL_INTERPOLATION, deprecate, logging from ...utils.testing_utils import load_image from . import StableDiffusionPipelineOutput @@ -86,7 +89,8 @@ def save_all(images, FORMAT="jpg", OUTDIR="./outputs/"): [^\\()\[\]:]+| : """, - re.X, ) + re.X, +) def parse_prompt_attention(text): @@ -175,9 +179,7 @@ def multiply_range(start_position, multiplier): return res -def get_prompts_with_weights(pipe: DiffusionPipeline, - prompt: List[str], - max_length: int): +def get_prompts_with_weights(pipe: DiffusionPipeline, prompt: List[str], max_length: int): r""" Tokenize a list of prompts and return its tokens with weights of each token. @@ -212,32 +214,20 @@ def get_prompts_with_weights(pipe: DiffusionPipeline, tokens.append(text_token) weights.append(text_weight) if truncated: - logger.warning( - "Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples" - ) + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") return tokens, weights -def pad_tokens_and_weights(tokens, - weights, - max_length, - bos, - eos, - pad, - no_boseos_middle=True, - chunk_length=77): +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): r""" Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. 
""" max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) - weights_length = (max_length if no_boseos_middle else - max_embeddings_multiples * chunk_length) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length for i in range(len(tokens)): - tokens[i] = ([bos] + tokens[i] + [eos] + [pad] * - (max_length - 2 - len(tokens[i]))) + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) if no_boseos_middle: - weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - - len(weights[i])) + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) else: w = [] if len(weights[i]) == 0: @@ -245,8 +235,7 @@ def pad_tokens_and_weights(tokens, else: for j in range(max_embeddings_multiples): w.append(1.0) # weight for starting token in this chunk - w += weights[i][j * (chunk_length - 2):min( - len(weights[i]), (j + 1) * (chunk_length - 2))] + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] w.append(1.0) # weight for ending token in this chunk w += [1.0] * (weights_length - len(w)) weights[i] = w[:] @@ -255,10 +244,11 @@ def pad_tokens_and_weights(tokens, def get_unweighted_text_embeddings( - pipe: DiffusionPipeline, - text_input: paddle.Tensor, - chunk_length: int, - no_boseos_middle: Optional[bool]=True, ): + pipe: DiffusionPipeline, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, +): """ When the length of tokens is a multiple of the capacity of the text encoder, it should be split into chunks and sent to the text encoder individually. @@ -268,8 +258,7 @@ def get_unweighted_text_embeddings( text_embeddings = [] for i in range(max_embeddings_multiples): # extract the i-th chunk - text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * ( - chunk_length - 2) + 2].clone() + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() # cover the head and the tail by the starting and the ending tokens text_input_chunk[:, 0] = text_input[0, 0] @@ -296,14 +285,15 @@ def get_unweighted_text_embeddings( def get_weighted_text_embeddings( - pipe: DiffusionPipeline, - prompt: Union[str, List[str]], - uncond_prompt: Optional[Union[str, List[str]]]=None, - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + pipe: DiffusionPipeline, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, +): r""" Prompts can be assigned with local weights using brackets. For example, prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', @@ -329,24 +319,19 @@ def get_weighted_text_embeddings( skip_weighting (`bool`, *optional*, defaults to `False`): Skip the weighting. When the parsing is skipped, it is forced True. 
""" - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 if isinstance(prompt, str): prompt = [prompt] if not skip_parsing: - prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, - max_length - 2) + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) if uncond_prompt is not None: if isinstance(uncond_prompt, str): uncond_prompt = [uncond_prompt] - uncond_tokens, uncond_weights = get_prompts_with_weights( - pipe, uncond_prompt, max_length - 2) + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) else: prompt_tokens = [ - token[1:-1] - for token in pipe.tokenizer( - prompt, max_length=max_length, truncation=True).input_ids + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids ] prompt_weights = [[1.0] * len(token) for token in prompt_tokens] if uncond_prompt is not None: @@ -354,33 +339,26 @@ def get_weighted_text_embeddings( uncond_prompt = [uncond_prompt] uncond_tokens = [ token[1:-1] - for token in pipe.tokenizer( - uncond_prompt, max_length=max_length, truncation=True) - .input_ids + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids ] uncond_weights = [[1.0] * len(token) for token in uncond_tokens] # round up the longest length of tokens to a multiple of (model_max_length - 2) max_length = max([len(token) for token in prompt_tokens]) if uncond_prompt is not None: - max_length = max(max_length, - max([len(token) for token in uncond_tokens])) + max_length = max(max_length, max([len(token) for token in uncond_tokens])) max_embeddings_multiples = min( max_embeddings_multiples, - (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, ) + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) max_embeddings_multiples = max(1, max_embeddings_multiples) - max_length = (pipe.tokenizer.model_max_length - 2 - ) * max_embeddings_multiples + 2 + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 # pad the length of tokens and weights # support bert tokenizer - bos = (pipe.tokenizer.bos_token_id - if pipe.tokenizer.bos_token_id is not None else - pipe.tokenizer.cls_token_id) - eos = (pipe.tokenizer.eos_token_id - if pipe.tokenizer.eos_token_id is not None else - pipe.tokenizer.sep_token_id) + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id pad = pipe.tokenizer.pad_token_id prompt_tokens, prompt_weights = pad_tokens_and_weights( prompt_tokens, @@ -390,7 +368,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) prompt_tokens = paddle.to_tensor(prompt_tokens, dtype=paddle.int64) if uncond_prompt is not None: uncond_tokens, uncond_weights = pad_tokens_and_weights( @@ -401,7 +380,8 @@ def get_weighted_text_embeddings( eos, pad, no_boseos_middle=no_boseos_middle, - chunk_length=pipe.tokenizer.model_max_length, ) + chunk_length=pipe.tokenizer.model_max_length, + ) uncond_tokens = paddle.to_tensor(uncond_tokens, dtype=paddle.int64) # get the embeddings @@ -409,30 +389,28 @@ def get_weighted_text_embeddings( pipe, prompt_tokens, pipe.tokenizer.model_max_length, 
- no_boseos_middle=no_boseos_middle, ) - prompt_weights = paddle.to_tensor( - prompt_weights, dtype=paddle.float32).cast(text_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + prompt_weights = paddle.to_tensor(prompt_weights, dtype=paddle.float32).cast(text_embeddings.dtype) if uncond_prompt is not None: uncond_embeddings = get_unweighted_text_embeddings( pipe, uncond_tokens, pipe.tokenizer.model_max_length, - no_boseos_middle=no_boseos_middle, ) - uncond_weights = paddle.to_tensor( - uncond_weights, dtype=paddle.float32).cast(uncond_embeddings.dtype) + no_boseos_middle=no_boseos_middle, + ) + uncond_weights = paddle.to_tensor(uncond_weights, dtype=paddle.float32).cast(uncond_embeddings.dtype) # assign weights to the prompts and normalize in the sense of mean # TODO: should we normalize by chunk or in a whole (current implementation)? if (not skip_parsing) and (not skip_weighting): previous_mean = text_embeddings.mean(axis=[-2, -1]) text_embeddings *= prompt_weights.unsqueeze(-1) - text_embeddings *= previous_mean / text_embeddings.mean( - axis=[-2, -1], keepdim=True) + text_embeddings *= previous_mean / text_embeddings.mean(axis=[-2, -1], keepdim=True) if uncond_prompt is not None: previous_mean = uncond_embeddings.mean(axis=[-2, -1]) uncond_embeddings *= uncond_weights.unsqueeze(-1) - uncond_embeddings *= previous_mean / uncond_embeddings.mean( - axis=[-2, -1], keepdim=True) + uncond_embeddings *= previous_mean / uncond_embeddings.mean(axis=[-2, -1], keepdim=True) if uncond_prompt is not None: return text_embeddings, uncond_embeddings @@ -453,9 +431,7 @@ def preprocess_mask(mask, scale_factor=8): mask = mask.convert("L") w, h = mask.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 - mask = mask.resize( - (w // scale_factor, h // scale_factor), - resample=PIL_INTERPOLATION["nearest"]) + mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? @@ -464,9 +440,7 @@ def preprocess_mask(mask, scale_factor=8): return mask -class StableDiffusionPipelineAllinOne(DiffusionPipeline, - TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionPipelineAllinOne(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-to-image image-to-image inpainting generation using Stable Diffusion. 
@@ -497,38 +471,38 @@ class StableDiffusionPipelineAllinOne(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler, - EulerDiscreteScheduler, - EulerAncestralDiscreteScheduler, - DPMSolverMultistepScheduler, ], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=False, ): - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = False, + ): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -536,11 +510,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -559,12 +529,10 @@ def __init__( f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -575,12 +543,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -592,7 +557,8 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self.register_to_config(requires_safety_checker=requires_safety_checker) self.__init__additional__() @@ -602,7 +568,8 @@ def __init__additional__(self): setattr( self, "vae_scale_factor", - 2**(len(self.vae.config.block_out_channels) - 1), ) + 2 ** (len(self.vae.config.block_out_channels) - 1), + ) def __call__(self, *args, **kwargs): return self.text2image(*args, **kwargs) @@ -611,16 +578,17 @@ def text2img(self, *args, **kwargs): return self.text2image(*args, **kwargs) def _encode_prompt( - self, - prompt, - negative_prompt, - max_embeddings_multiples, - no_boseos_middle, - skip_parsing, - skip_weighting, - do_classifier_free_guidance, - num_images_per_prompt, - **kwargs, ): + self, + prompt, + negative_prompt, + max_embeddings_multiples, + no_boseos_middle, + skip_parsing, + skip_weighting, + do_classifier_free_guidance, + num_images_per_prompt, + **kwargs, + ): batch_size = len(prompt) if isinstance(prompt, list) else 1 if negative_prompt is None: @@ -631,41 +599,37 @@ def _encode_prompt( raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) text_embeddings, uncond_embeddings = get_weighted_text_embeddings( pipe=self, prompt=prompt, - uncond_prompt=negative_prompt - if do_classifier_free_guidance else None, + uncond_prompt=negative_prompt if do_classifier_free_guidance else None, max_embeddings_multiples=max_embeddings_multiples, no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - **kwargs, ) + **kwargs, + ) bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.tile([1, num_images_per_prompt, 1]) - text_embeddings = text_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + text_embeddings = text_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: seq_len = uncond_embeddings.shape[1] - uncond_embeddings = uncond_embeddings.tile( - [1, num_images_per_prompt, 1]) - uncond_embeddings = uncond_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) + uncond_embeddings = uncond_embeddings.tile([1, num_images_per_prompt, 1]) + uncond_embeddings = uncond_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) return text_embeddings def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -684,8 +648,7 @@ def prepare_extra_step_kwargs(self, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta @@ -694,61 +657,47 @@ def prepare_extra_step_kwargs(self, eta): def check_inputs_text2img(self, prompt, height, width, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) def check_inputs_img2img_inpaint(self, prompt, strength, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [1.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [1.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") - - def prepare_latents_text2img(self, - batch_size, - num_channels_latents, - height, - width, - dtype, - latents=None): + f" {type(callback_steps)}." + ) + + def prepare_latents_text2img(self, batch_size, num_channels_latents, height, width, dtype, latents=None): shape = [batch_size, num_channels_latents, height // 8, width // 8] if latents is None: latents = paddle.randn(shape, dtype=dtype) else: if latents.shape != shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma return latents - def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, - dtype): + def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, dtype): image = image.cast(dtype=dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample() @@ -756,8 +705,7 @@ def prepare_latents_img2img(self, image, timestep, num_images_per_prompt, b, c, h, w = init_latents.shape init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1]) - init_latents = init_latents.reshape( - [b * num_images_per_prompt, c, h, w]) + init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w]) # add noise to latents using the timesteps noise = paddle.randn(init_latents.shape, dtype=dtype) @@ -779,8 +727,7 @@ def get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start - def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, - dtype): + def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, dtype): image = image.cast(dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample() @@ -788,8 +735,7 @@ def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, b, c, h, w = init_latents.shape init_latents = init_latents.tile([1, num_images_per_prompt, 1, 1]) - init_latents = init_latents.reshape( - [b * num_images_per_prompt, c, h, w]) + init_latents = init_latents.reshape([b * num_images_per_prompt, c, h, w]) init_latents_orig = init_latents @@ -801,27 +747,28 @@ def prepare_latents_inpaint(self, image, timestep, num_images_per_prompt, @paddle.no_grad() def text2image( - self, - prompt: Union[str, List[str]], - height: int=512, - width: int=512, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - 
eta: float=0.0, - seed: Optional[int]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - # new add - max_embeddings_multiples: Optional[int]=3, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + seed: Optional[int] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + # new add + max_embeddings_multiples: Optional[int] = 3, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -891,7 +838,8 @@ def text2image( no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - epoch_time=time.time(), ) + epoch_time=time.time(), + ) paddle.seed(seed) # 1. Check inputs. Raise error if not correct self.check_inputs_text2img(prompt, height, width, callback_steps) @@ -912,7 +860,8 @@ def text2image( skip_parsing, skip_weighting, do_classifier_free_guidance, - num_images_per_prompt, ) + num_images_per_prompt, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -926,42 +875,33 @@ def text2image( height, width, text_embeddings.dtype, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -970,8 +910,7 @@ def text2image( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 10. 
Convert to PIL if output_type == "pil": @@ -980,33 +919,33 @@ def text2image( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def img2img( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - strength: float=0.8, - height=None, - width=None, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - # new add - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + strength: float = 0.8, + height=None, + width=None, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + # new add + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -1093,7 +1032,8 @@ def img2img( no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - epoch_time=time.time(), ) + epoch_time=time.time(), + ) paddle.seed(seed) # 1. Check inputs @@ -1115,7 +1055,8 @@ def img2img( skip_parsing, skip_weighting, do_classifier_free_guidance, - num_images_per_prompt, ) + num_images_per_prompt, + ) # 4. Preprocess image if isinstance(image, PIL.Image.Image): @@ -1124,50 +1065,36 @@ def img2img( # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables - latents = self.prepare_latents_img2img(image, latent_timestep, - num_images_per_prompt, - text_embeddings.dtype) + latents = self.prepare_latents_img2img(image, latent_timestep, num_images_per_prompt, text_embeddings.dtype) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(eta) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1176,8 +1103,7 @@ def img2img( image = self.decode_latents(latents) # 10. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 11. 
Convert to PIL if output_type == "pil": @@ -1186,34 +1112,34 @@ def img2img( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() def inpaint( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image], - mask_image: Union[paddle.Tensor, PIL.Image.Image], - height=None, - width=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - seed: Optional[int]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - # new add - max_embeddings_multiples: Optional[int]=1, - no_boseos_middle: Optional[bool]=False, - skip_parsing: Optional[bool]=False, - skip_weighting: Optional[bool]=False, - **kwargs, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image], + mask_image: Union[paddle.Tensor, PIL.Image.Image], + height=None, + width=None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + # new add + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -1309,7 +1235,8 @@ def inpaint( no_boseos_middle=no_boseos_middle, skip_parsing=skip_parsing, skip_weighting=skip_weighting, - epoch_time=time.time(), ) + epoch_time=time.time(), + ) paddle.seed(seed) # 1. Check inputs @@ -1331,7 +1258,8 @@ def inpaint( skip_parsing, skip_weighting, do_classifier_free_guidance, - num_images_per_prompt, ) + num_images_per_prompt, + ) if not isinstance(image, paddle.Tensor): image = image.resize((width, height)) @@ -1343,16 +1271,14 @@ def inpaint( # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables # encode the init image into latents and scale the latents latents, init_latents_orig, noise = self.prepare_latents_inpaint( - image, latent_timestep, num_images_per_prompt, - text_embeddings.dtype) + image, latent_timestep, num_images_per_prompt, text_embeddings.dtype + ) # 7. Prepare mask latent mask = mask_image.cast(latents.dtype) @@ -1362,41 +1288,30 @@ def inpaint( extra_step_kwargs = self.prepare_extra_step_kwargs(eta) # 9. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # masking - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise, t) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t) latents = (init_latents_proper * mask) + (latents * (1 - mask)) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1405,8 +1320,7 @@ def inpaint( image = self.decode_latents(latents) # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - text_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, text_embeddings.dtype) # 12. 
Convert to PIL if output_type == "pil": @@ -1415,8 +1329,7 @@ def inpaint( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) @staticmethod def numpy_to_pil(images, **kwargs): diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 25099d6d6c726..4e5e08168878d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -21,8 +21,7 @@ import paddle import paddle.nn as nn from paddle.nn import functional as F -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -100,8 +99,7 @@ def aggregate_attention(self, from_where: List[str]) -> paddle.Tensor: attention_maps = self.get_average_attention() for location in from_where: for item in attention_maps[location]: - cross_maps = item.reshape( - [-1, self.attn_res[0], self.attn_res[1], item.shape[-1]]) + cross_maps = item.reshape([-1, self.attn_res[0], self.attn_res[1], item.shape[-1]]) out.append(cross_maps) out = paddle.concat(out, axis=0) out = out.sum(0) / out.shape[0] @@ -132,21 +130,19 @@ def __init__(self, attnstore, place_in_unet): self.place_in_unet = place_in_unet def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) is_cross = encoder_hidden_states is not None - encoder_hidden_states = (encoder_hidden_states - if encoder_hidden_states is not None else - hidden_states) + encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -160,8 +156,7 @@ def __call__( if not attention_probs.stop_gradient: # TODO must flatten (0, 1) # [bs, num_heads, q_len, k_len] -> [bs*num_heads, q_len, k_len] - self.attnstore( - attention_probs.flatten(0, 1), is_cross, self.place_in_unet) + self.attnstore(attention_probs.flatten(0, 1), is_cross, self.place_in_unet) hidden_states = paddle.matmul(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) @@ -174,8 +169,7 @@ def __call__( return hidden_states -class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion and Attend and Excite. 
@@ -205,15 +199,16 @@ class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -239,19 +234,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -291,29 +288,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -321,8 +320,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -332,21 +330,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -354,47 +353,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -415,54 +410,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - indices, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + indices, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -475,22 +466,19 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - indices_is_list_ints = isinstance(indices, list) and isinstance( - indices[0], int) - indices_is_list_list_ints = (isinstance(indices, list) and - isinstance(indices[0], list) and - isinstance(indices[0][0], int)) + indices_is_list_ints = isinstance(indices, list) and isinstance(indices[0], int) + indices_is_list_list_ints = ( + isinstance(indices, list) and isinstance(indices[0], list) and isinstance(indices[0][0], int) + ) if not indices_is_list_ints and not indices_is_list_list_ints: - raise TypeError( - "`indices` must be a list of ints or a list of a list of ints") + raise TypeError("`indices` must be a list of ints or a list of a list of ints") - if (indices is None) or (indices is not None and - not isinstance(indices, List)): - raise ValueError( - f"`indices` has to be a list but is {type(indices)}") + if (indices is None) or (indices is not None and not isinstance(indices, List)): + raise ValueError(f"`indices` has to be a list but is {type(indices)}") if indices_is_list_ints: indices_batch_size = 1 @@ -511,19 +499,21 @@ def check_inputs( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -539,8 +529,9 @@ def prepare_latents( @staticmethod def _compute_max_attention_per_index( - attention_maps: paddle.Tensor, - indices: List[int], ) -> List[paddle.Tensor]: + attention_maps: paddle.Tensor, + indices: List[int], + ) -> List[paddle.Tensor]: """Computes the maximum attention value for each of the tokens we wish to alter.""" attention_for_text = attention_maps[:, :, 1:-1] attention_for_text *= 100 @@ -554,38 +545,35 @@ def _compute_max_attention_per_index( for i in indices: image = attention_for_text[:, :, i] smoothing = GaussianSmoothing() - input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), - mode="reflect") + input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect") image = smoothing(input).squeeze(0).squeeze(0) # paddle.max donot support float16 max_indices_list.append(image.max()) return max_indices_list def _aggregate_and_get_max_attention_per_token( - self, - indices: List[int], ): + self, + indices: List[int], + ): """Aggregates the attention for each token 
and computes the max activation value for each token to alter.""" attention_maps = self.attention_store.aggregate_attention( - from_where=("up", "down", "mid"), ) + from_where=("up", "down", "mid"), + ) max_attention_per_index = self._compute_max_attention_per_index( attention_maps=attention_maps, - indices=indices, ) + indices=indices, + ) return max_attention_per_index @staticmethod - def _compute_loss( - max_attention_per_index: List[paddle.Tensor]) -> paddle.Tensor: + def _compute_loss(max_attention_per_index: List[paddle.Tensor]) -> paddle.Tensor: """Computes the attend-and-excite loss using the maximum attention value for each token.""" - losses = [ - max(0, 1.0 - curr_max) for curr_max in max_attention_per_index - ] + losses = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index] loss = max(losses) return loss @staticmethod - def _update_latent(latents: paddle.Tensor, - loss: paddle.Tensor, - step_size: float) -> paddle.Tensor: + def _update_latent(latents: paddle.Tensor, loss: paddle.Tensor, step_size: float) -> paddle.Tensor: """Update the latent according to the computed loss.""" loss.stop_gradient = False grad_cond = paddle.autograd.grad(loss, [latents], retain_graph=True)[0] @@ -593,15 +581,16 @@ def _update_latent(latents: paddle.Tensor, return latents def _perform_iterative_refinement_step( - self, - latents: paddle.Tensor, - indices: List[int], - loss: paddle.Tensor, - threshold: float, - text_embeddings: paddle.Tensor, - step_size: float, - t: int, - max_refinement_steps: int=20, ): + self, + latents: paddle.Tensor, + indices: List[int], + loss: paddle.Tensor, + threshold: float, + text_embeddings: paddle.Tensor, + step_size: float, + t: int, + max_refinement_steps: int = 20, + ): """ Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent code according to our loss objective until the given threshold is reached for all tokens. @@ -618,7 +607,8 @@ def _perform_iterative_refinement_step( # Get max activation value for each subject token max_attention_per_index = self._aggregate_and_get_max_attention_per_token( - indices=indices, ) + indices=indices, + ) loss = self._compute_loss(max_attention_per_index) @@ -628,9 +618,7 @@ def _perform_iterative_refinement_step( logger.info(f"\t Try {iteration}. loss: {loss}") if iteration >= max_refinement_steps: - logger.info( - f"\t Exceeded max number of iterations ({max_refinement_steps})! " - ) + logger.info(f"\t Exceeded max number of iterations ({max_refinement_steps})! ") break # Run one more time but don't compute gradients and update the latents. 
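The reformatted `_compute_loss` and `_update_latent` above carry the core Attend-and-Excite rule: the loss is the largest `1 - a_i` over the per-token maximum attention values `a_i`, and the latents take one gradient step against that loss, scaled by a `step_size` that `__call__` later anneals via `scale_factor * np.sqrt(scale_range)`. A minimal self-contained sketch of the same arithmetic, using a toy differentiable stand-in for the aggregated attention so it runs without a UNet:

    import paddle

    # Toy latents; in the pipeline these are the per-image denoising latents.
    latents = paddle.randn([1, 4, 16, 16])
    latents.stop_gradient = False

    # Stand-in for _aggregate_and_get_max_attention_per_token: fake each subject
    # token's "max attention" as a channel mean so the loss depends on `latents`.
    max_attention_per_index = [latents[:, 0].mean(), latents[:, 1].mean()]

    # Same hinge objective as _compute_loss: penalise the least-attended token.
    losses = [paddle.clip(1.0 - a, min=0.0) for a in max_attention_per_index]
    loss = paddle.stack(losses).max()

    # Same update as _update_latent: one gradient step on the latents.
    step_size = 20.0  # assumed value; the pipeline anneals it per timestep
    grad_cond = paddle.autograd.grad(loss, [latents], retain_graph=True)[0]
    latents = latents - step_size * grad_cond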
@@ -643,7 +631,8 @@ def _perform_iterative_refinement_step( # Get max activation value for each subject token max_attention_per_index = self._aggregate_and_get_max_attention_per_token( - indices=indices, ) + indices=indices, + ) loss = self._compute_loss(max_attention_per_index) logger.info(f"\t Finished with loss of: {loss}") return loss, latents, max_attention_per_index @@ -662,8 +651,7 @@ def register_attention_control(self): continue cross_att_count += 1 - attn_procs[name] = AttendExciteAttnProcessor( - attnstore=self.attention_store, place_in_unet=place_in_unet) + attn_procs[name] = AttendExciteAttnProcessor(attnstore=self.attention_store, place_in_unet=place_in_unet) self.unet.set_attn_processor(attn_procs) self.attention_store.num_att_layers = cross_att_count @@ -671,42 +659,36 @@ def register_attention_control(self): def get_indices(self, prompt: str) -> Dict[str, int]: """Utility function to list the indices of the tokens you wish to alte""" ids = self.tokenizer(prompt).input_ids - indices = { - i: tok - for tok, i in zip( - self.tokenizer.convert_ids_to_tokens(ids), range(len(ids))) - } + indices = {i: tok for tok, i in zip(self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))} return indices @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]], - token_indices: Union[List[int], List[List[int]]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: int=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - max_iter_to_alter: int=25, - thresholds: dict={0: 0.05, - 10: 0.5, - 20: 0.8}, - scale_factor: int=20, - attn_res: Optional[Tuple[int]]=(16, 16), ): + self, + prompt: Union[str, List[str]], + token_indices: Union[List[int], List[List[int]]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + max_iter_to_alter: int = 25, + thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, + scale_factor: int = 20, + attn_res: Optional[Tuple[int]] = (16, 16), + ): r""" Function invoked when calling the pipeline for generation. @@ -802,7 +784,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): @@ -824,7 +807,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -839,7 +823,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -853,8 +838,9 @@ def __call__( scale_range = np.linspace(1.0, 0.5, len(self.scheduler.timesteps)) step_size = scale_factor * np.sqrt(scale_range) - text_embeddings = (prompt_embeds[batch_size * num_images_per_prompt:] - if do_classifier_free_guidance else prompt_embeds) + text_embeddings = ( + prompt_embeds[batch_size * num_images_per_prompt :] if do_classifier_free_guidance else prompt_embeds + ) if isinstance(token_indices[0], int): token_indices = [token_indices] @@ -865,8 +851,7 @@ def __call__( indices = indices + [ind] * num_images_per_prompt # 7. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # Attend and excite process @@ -874,8 +859,7 @@ def __call__( latents = latents.clone().detach() latents.stop_gradient = False updated_latents = [] - for latent, index, text_embedding in zip(latents, indices, - text_embeddings): + for latent, index, text_embedding in zip(latents, indices, text_embeddings): # Forward pass of denoising with text conditioning latent = latent.unsqueeze(0) text_embedding = text_embedding.unsqueeze(0) @@ -889,28 +873,23 @@ def __call__( self.unet.clear_gradients() # Get max activation value for each subject token - max_attention_per_index = ( - self._aggregate_and_get_max_attention_per_token( - indices=index, )) + max_attention_per_index = self._aggregate_and_get_max_attention_per_token( + indices=index, + ) - loss = self._compute_loss( - max_attention_per_index=max_attention_per_index) + loss = self._compute_loss(max_attention_per_index=max_attention_per_index) # If this is an iterative refinement step, verify we have reached the desired threshold for all - if i in thresholds.keys() and loss > 1.0 - thresholds[ - i]: - ( - loss, - latent, - max_attention_per_index, - ) = self._perform_iterative_refinement_step( + if i in thresholds.keys() and loss > 1.0 - thresholds[i]: + (loss, latent, max_attention_per_index,) = self._perform_iterative_refinement_step( latents=latent, indices=index, loss=loss, threshold=thresholds[i], text_embeddings=text_embedding, step_size=step_size[i], - t=t, ) + t=t, + ) # Perform gradient update if i < max_iter_to_alter: @@ -918,41 +897,36 @@ def __call__( latent = self._update_latent( latents=latent, loss=loss, - step_size=step_size[i], ) - logger.info( - f"Iteration {i} | Loss: {loss.item():0.4f}") + step_size=step_size[i], + ) + logger.info(f"Iteration {i} | Loss: {loss.item():0.4f}") updated_latents.append(latent) latents = paddle.concat(updated_latents, axis=0) # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - 
latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -961,8 +935,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL if output_type == "pil": @@ -971,8 +944,7 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) class GaussianSmoothing(nn.Layer): @@ -989,11 +961,12 @@ class GaussianSmoothing(nn.Layer): # channels=1, kernel_size=kernel_size, sigma=sigma, dim=2 def __init__( - self, - channels: int=1, - kernel_size: int=3, - sigma: float=0.5, - dim: int=2, ): + self, + channels: int = 1, + kernel_size: int = 3, + sigma: float = 0.5, + dim: int = 2, + ): super().__init__() if isinstance(kernel_size, int): @@ -1004,21 +977,17 @@ def __init__( # The gaussian kernel is the product of the # gaussian function of each dimension. kernel = 1 - meshgrids = paddle.meshgrid([ - paddle.arange( - size, dtype=paddle.float32) for size in kernel_size - ]) + meshgrids = paddle.meshgrid([paddle.arange(size, dtype=paddle.float32) for size in kernel_size]) for size, std, mgrid in zip(kernel_size, sigma, meshgrids): mean = (size - 1) / 2 - kernel *= (1 / (std * math.sqrt(2 * math.pi)) * - paddle.exp(-(((mgrid - mean) / (2 * std))**2))) + kernel *= 1 / (std * math.sqrt(2 * math.pi)) * paddle.exp(-(((mgrid - mean) / (2 * std)) ** 2)) # Make sure sum of values in gaussian kernel equals 1. kernel = kernel / paddle.sum(kernel) # Reshape to depthwise convolutional weight kernel = kernel.reshape([1, 1, *kernel.shape]) - kernel = kernel.tile([channels, * [1] * (kernel.ndim - 1)]) + kernel = kernel.tile([channels, *[1] * (kernel.ndim - 1)]) self.register_buffer("weight", kernel) self.groups = channels @@ -1030,9 +999,7 @@ def __init__( elif dim == 3: self.conv = F.conv3d else: - raise RuntimeError( - "Only 1, 2 and 3 dimensions are supported. Received {}.".format( - dim)) + raise RuntimeError("Only 1, 2 and 3 dimensions are supported. 
Received {}.".format(dim)) def forward(self, input): """ @@ -1042,5 +1009,4 @@ def forward(self, input): Returns: filtered (paddle.Tensor): Filtered output. """ - return self.conv( - input, weight=self.weight.cast(input.dtype), groups=self.groups) + return self.conv(input, weight=self.weight.cast(input.dtype), groups=self.groups) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py index c46f6b8e52147..448660c4ef7c3 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_controlnet.py @@ -21,16 +21,14 @@ import paddle import paddle.nn as nn import PIL.Image -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...models.controlnet import ControlNetOutput from ...models.modeling_utils import ModelMixin from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, logging, randn_tensor, - replace_example_docstring) +from ...utils import PIL_INTERPOLATION, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -88,28 +86,25 @@ class MultiControlNetModel(ModelMixin): `ControlNetModel` as a list. """ - def __init__( - self, - controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): + def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): super().__init__() self.nets = nn.LayerList(controlnets) def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - controlnet_cond: List[paddle.Tensor], - conditioning_scale: List[float], - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - guess_mode: bool=False, - return_dict: bool=True, ) -> Union[ControlNetOutput, Tuple]: - for i, ( - image, scale, controlnet - ) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + controlnet_cond: List[paddle.Tensor], + conditioning_scale: List[float], + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: + for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): down_samples, mid_sample = controlnet( sample, timestep, @@ -121,7 +116,8 @@ def forward( attention_mask, cross_attention_kwargs, guess_mode, - return_dict, ) + return_dict, + ) # merge samples if i == 0: @@ -129,16 +125,14 @@ def forward( else: down_block_res_samples = [ samples_prev + samples_curr - for samples_prev, samples_curr in zip( - down_block_res_samples, down_samples) + for 
samples_prev, samples_curr in zip(down_block_res_samples, down_samples) ] mid_block_res_sample += mid_sample return down_block_res_samples, mid_block_res_sample -class StableDiffusionControlNetPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the @@ -174,17 +168,22 @@ class StableDiffusionControlNetPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ - ControlNetModel], MultiControlNetModel, ], - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ + ControlNetModel, + List[ControlNetModel], + Tuple[ControlNetModel], + MultiControlNetModel, + ], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -214,8 +213,9 @@ def __init__( controlnet=controlnet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) def enable_vae_slicing(self): @@ -250,13 +250,14 @@ def disable_vae_tiling(self): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
Args: @@ -295,32 +296,36 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -328,8 +333,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -339,21 +343,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -361,50 +366,48 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - config = (self.text_encoder.config - if isinstance(self.text_encoder.config, dict) else - self.text_encoder.config.to_dict()) - if (config.get("use_attention_mask", None) is not None and - config["use_attention_mask"]): + config = ( + self.text_encoder.config + if isinstance(self.text_encoder.config, dict) + else self.text_encoder.config.to_dict() + ) + if config.get("use_attention_mask", None) is not None and config["use_attention_mask"]: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -425,55 +428,51 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - controlnet_conditioning_scale=1.0, ): + self, + prompt, + image, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -486,7 +485,8 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # `prompt` needs more sophisticated handling when there are multiple # conditionings. @@ -502,15 +502,12 @@ def check_inputs( self.check_image(image, prompt, prompt_embeds) elif isinstance(self.controlnet, MultiControlNetModel): if not isinstance(image, list): - raise TypeError( - "For multiple controlnets: `image` must be type `list`") + raise TypeError("For multiple controlnets: `image` must be type `list`") # When `image` is a nested list: # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) elif any(isinstance(i, list) for i in image): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." - ) + raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( "For multiple controlnets: `image` must have the same length as the number of controlnets." @@ -524,35 +521,28 @@ def check_inputs( # Check `controlnet_conditioning_scale` if isinstance(self.controlnet, ControlNetModel): if not isinstance(controlnet_conditioning_scale, float): - raise TypeError( - "For single controlnet: `controlnet_conditioning_scale` must be type `float`." - ) + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") elif isinstance(self.controlnet, MultiControlNetModel): if isinstance(controlnet_conditioning_scale, list): - if any( - isinstance(i, list) - for i in controlnet_conditioning_scale): - raise ValueError( - "A single batch of multiple conditionings are supported at the moment." 
- ) - elif isinstance(controlnet_conditioning_scale, list) and len( - controlnet_conditioning_scale) != len(self.controlnet.nets): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): raise ValueError( "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" - " the same length as the number of controlnets") + " the same length as the number of controlnets" + ) else: assert False def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, paddle.Tensor) - image_is_pil_list = isinstance(image, list) and isinstance( - image[0], PIL.Image.Image) - image_is_tensor_list = isinstance(image, list) and isinstance( - image[0], paddle.Tensor) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], paddle.Tensor) - if (not image_is_pil and not image_is_tensor and - not image_is_pil_list and not image_is_tensor_list): + if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: raise TypeError( "image must be one of PIL image, paddle tensor, list of PIL images, or list of paddle tensors" ) @@ -579,15 +569,16 @@ def check_image(self, image, prompt, prompt_embeds): ) def prepare_image( - self, - image, - width, - height, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance=False, - guess_mode=False, ): + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): if not isinstance(image, paddle.Tensor): if isinstance(image, PIL.Image.Image): image = [image] @@ -596,8 +587,7 @@ def prepare_image( images = [] for image_ in image: image_ = image_.convert("RGB") - image_ = image_.resize( - (width, height), resample=PIL_INTERPOLATION["lanczos"]) + image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) image_ = np.array(image_) image_ = image_[None, :] images.append(image_) @@ -627,14 +617,15 @@ def prepare_image( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -678,48 +669,47 @@ def _default_height_width(self, height, width, image): # override DiffusionPipeline def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool=False, - variant: Optional[str]=None, - to_diffusers: bool=None, ): + self, + save_directory: Union[str, os.PathLike], + safe_serialization: bool = False, + variant: Optional[str] = None, + to_diffusers: bool = None, + ): if isinstance(self.controlnet, ControlNetModel): super().save_pretrained( save_directory, safe_serialization=safe_serialization, variant=variant, - to_diffusers=to_diffusers, ) - else: - raise NotImplementedError( - "Currently, the `save_pretrained()` is not implemented for Multi-ControlNet." 
+ to_diffusers=to_diffusers, ) + else: + raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.") @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, List[paddle.Tensor], - List[PIL.Image.Image]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - controlnet_conditioning_scale: Union[float, List[float]]=1.0, - guess_mode: bool=False, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image, List[paddle.Tensor], List[PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -813,7 +803,8 @@ def __call__( negative_prompt, prompt_embeds, negative_prompt_embeds, - controlnet_conditioning_scale, ) + controlnet_conditioning_scale, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -828,10 +819,8 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - if isinstance(self.controlnet, MultiControlNetModel) and isinstance( - controlnet_conditioning_scale, float): - controlnet_conditioning_scale = [controlnet_conditioning_scale - ] * len(self.controlnet.nets) + if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets) # 3. Encode input prompt prompt_embeds = self._encode_prompt( @@ -840,7 +829,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Prepare image if isinstance(self.controlnet, ControlNetModel): @@ -852,7 +842,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) elif isinstance(self.controlnet, MultiControlNetModel): images = [] @@ -865,7 +856,8 @@ def __call__( num_images_per_prompt=num_images_per_prompt, dtype=self.controlnet.dtype, do_classifier_free_guidance=do_classifier_free_guidance, - guess_mode=guess_mode, ) + guess_mode=guess_mode, + ) images.append(image_) @@ -886,21 +878,19 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # controlnet(s) inference if guess_mode and do_classifier_free_guidance: @@ -919,20 +909,17 @@ def __call__( controlnet_cond=image, conditioning_scale=controlnet_conditioning_scale, guess_mode=guess_mode, - return_dict=False, ) + return_dict=False, + ) if guess_mode and do_classifier_free_guidance: # Infered ControlNet only for the conditional batch. # To apply the output of ControlNet to both the unconditional and conditional batches, # add 0 to the unconditional batch to keep it unchanged. 
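# Illustrative sketch of the padding performed just below (toy shapes assumed):
# with classifier-free guidance the UNet batch is [uncond, cond], but in guess
# mode the ControlNet only ran on the conditional half, so zeros are prepended
# to every residual to leave the unconditional half unchanged.
import paddle
d = paddle.ones([1, 320, 64, 64])                  # residual for the conditional half
padded = paddle.concat([paddle.zeros_like(d), d])  # batch of 2: [zeros, d]
assert padded.shape[0] == 2 and float(padded[0].abs().sum()) == 0.0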
- down_block_res_samples = [ - paddle.concat([paddle.zeros_like(d), d]) - for d in down_block_res_samples - ] - mid_block_res_sample = paddle.concat([ - paddle.zeros_like(mid_block_res_sample), - mid_block_res_sample - ]) + down_block_res_samples = [paddle.concat([paddle.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = paddle.concat( + [paddle.zeros_like(mid_block_res_sample), mid_block_res_sample] + ) # predict the noise residual noise_pred = self.unet( @@ -941,22 +928,19 @@ def __call__( encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, down_block_additional_residuals=down_block_res_samples, - mid_block_additional_residual=mid_block_res_sample, ).sample + mid_block_additional_residual=mid_block_res_sample, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -969,8 +953,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -979,11 +962,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 4a517f2085671..9bbe0ba73588b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -20,8 +20,12 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPTextModel, CLIPTokenizer, - DPTForDepthEstimation, DPTImageProcessor) +from paddlenlp.transformers import ( + CLIPTextModel, + CLIPTokenizer, + DPTForDepthEstimation, + DPTImageProcessor, +) from ...configuration_utils import FrozenDict from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin @@ -44,11 +48,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -59,8 +59,7 @@ def preprocess(image): return image -class StableDiffusionDepth2ImgPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. @@ -90,22 +89,21 @@ class StableDiffusionDepth2ImgPipeline( """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - depth_estimator: DPTForDepthEstimation, - feature_extractor: DPTImageProcessor, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + depth_estimator: DPTForDepthEstimation, + feature_extractor: DPTImageProcessor, + ): super().__init__() - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -116,12 +114,9 @@ def __init__( " configuration file. 
Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -133,18 +128,20 @@ def __init__( unet=unet, scheduler=scheduler, depth_estimator=depth_estimator, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -184,29 +181,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -214,8 +213,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, 
seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -225,21 +223,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -247,47 +246,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -308,52 +303,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -366,27 +358,21 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -403,8 +389,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -412,8 +397,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -425,12 +409,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
) @@ -446,8 +429,7 @@ def prepare_latents, return latents - def prepare_depth_map(self, image, depth_map, batch_size, - do_classifier_free_guidance, dtype): + def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype): if isinstance(image, PIL.Image.Image): image = [image] else: @@ -459,27 +441,24 @@ def prepare_depth_map(self, image, depth_map, batch_size, height, width = image[0].shape[-2:] if depth_map is None: - pixel_values = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + pixel_values = self.feature_extractor(images=image, return_tensors="pd").pixel_values # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16. # TODO DPTModel `expand_as` does not support float16 with paddle.amp.auto_cast(True, level="O2"): - depth_map = self.depth_estimator( - pixel_values).predicted_depth.cast("float32") + depth_map = self.depth_estimator(pixel_values).predicted_depth.cast("float32") else: depth_map = depth_map.cast("float32") depth_map = paddle.nn.functional.interpolate( depth_map.unsqueeze(1), - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor), + size=(height // self.vae_scale_factor, width // self.vae_scale_factor), mode="bicubic", - align_corners=False, ) + align_corners=False, + ) # amin / amax do not support float16 depth_min = paddle.amin(depth_map, axis=[1, 2, 3], keepdim=True) depth_max = paddle.amax(depth_map, axis=[1, 2, 3], keepdim=True) - depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min - ) - 1.0 + depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0 # maybe cast to float16 depth_map = depth_map.cast(dtype) @@ -488,30 +467,29 @@ def prepare_depth_map(self, image, depth_map, batch_size, repeat_by = batch_size // depth_map.shape[0] depth_map = depth_map.tile([repeat_by, 1, 1, 1]) - depth_map = (paddle.concat([depth_map] * 2) - if do_classifier_free_guidance else depth_map) + depth_map = paddle.concat([depth_map] * 2) if do_classifier_free_guidance else depth_map return depth_map @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - depth_map: Optional[paddle.Tensor]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + depth_map: Optional[paddle.Tensor] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, +
callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -603,7 +581,8 @@ def __call__( callback_steps, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -627,7 +606,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare depth mask depth_mask = self.prepare_depth_map( @@ -635,17 +615,16 @@ def __call__( depth_map, batch_size * num_images_per_prompt, do_classifier_free_guidance, - prompt_embeds.dtype, ) + prompt_embeds.dtype, + ) # 5. Preprocess image image = preprocess(image) # 6. Set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 7. Prepare latent variables latents = self.prepare_latents( @@ -654,44 +633,35 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, depth_mask], axis=1) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, depth_mask], axis=1) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample latents = latents.cast(prompt_embeds.dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not 
None and i % callback_steps == 0: callback(i, t, latents) @@ -704,6 +674,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 87ea9a04eb5f6..48556ee9e0bfb 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -19,8 +19,7 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -62,14 +61,15 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline): _optional_components = ["safety_checker"] def __init__( - self, - vae: AutoencoderKL, - image_encoder: CLIPVisionModelWithProjection, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + image_encoder: CLIPVisionModelWithProjection, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -88,12 +88,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -104,12 +102,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -120,17 +115,16 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) - def _encode_image(self, image, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_image(self, image, num_images_per_prompt, do_classifier_free_guidance): dtype = self.image_encoder.dtype if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) image_embeddings = self.image_encoder(image).image_embeds @@ -139,8 +133,7 @@ def _encode_image(self, image, num_images_per_prompt, # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: negative_prompt_embeds = paddle.zeros_like(image_embeddings) @@ -148,19 +141,17 @@ def _encode_image(self, image, num_images_per_prompt, # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -181,54 +172,56 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -244,21 +237,21 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -332,8 +325,7 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input image - image_embeddings = self._encode_image(image, num_images_per_prompt, - do_classifier_free_guidance) + image_embeddings = self._encode_image(image, num_images_per_prompt, do_classifier_free_guidance) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -348,42 +340,33 @@ def __call__( width, image_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=image_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -392,8 +375,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, image_embeddings.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, image_embeddings.dtype) # 10. 
Convert to PIL if output_type == "pil": @@ -402,5 +384,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index b26c0e76369b2..d8bee685bc963 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -20,17 +20,20 @@ import paddle import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers -from ...utils import (PIL_INTERPOLATION, deprecate, logging, randn_tensor, - replace_example_docstring) +from ...utils import ( + PIL_INTERPOLATION, + deprecate, + logging, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -74,11 +77,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -89,9 +88,7 @@ def preprocess(image): return image -class StableDiffusionImg2ImgPipeline(DiffusionPipeline, - TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. 
@@ -130,37 +127,33 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline, # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might lead to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -168,11 +161,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -193,12 +182,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -209,12 +196,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -226,22 +210,24 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor( - vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) self.register_to_config( - requires_safety_checker=requires_safety_checker, ) + requires_safety_checker=requires_safety_checker, + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -281,36 +267,37 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -320,21 +307,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -342,36 +330,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -380,17 +365,14 @@ def run_safety_checker(self, image, dtype): has_nsfw_concept = None else: if paddle.is_tensor(image): - feature_extractor_input = self.image_processor.postprocess( - image, output_type="pil") + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") else: - feature_extractor_input = self.image_processor.numpy_to_pil( - image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="pd") + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pd") image, has_nsfw_concept = self.safety_checker( images=image, - clip_input=paddle.cast(safety_checker_input.pixel_values, - dtype), ) + clip_input=paddle.cast(safety_checker_input.pixel_values, dtype), + ) return image, has_nsfw_concept def decode_latents(self, latents): @@ -406,51 +388,48 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -463,29 +442,21 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, - image, - timestep, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, (paddle.Tensor, list)): - raise ValueError( - f"`image` has to be of type `paddle.Tensor` or list but is {type(image)}" - ) + raise ValueError(f"`image` has to be of type `paddle.Tensor` or list but is {type(image)}") image = image.cast(dtype) @@ -498,8 +469,7 @@ def prepare_latents(self, if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = paddle.concat(init_latents, axis=0) else: @@ -507,8 +477,7 @@ def prepare_latents(self, init_latents = self.vae.config.scaling_factor * init_latents - if (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] == 0): + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" @@ -520,12 +489,11 @@ def prepare_latents(self, "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = paddle.concat( - [init_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > init_latents.shape[0] and - batch_size % init_latents.shape[0] != 0): + init_latents = paddle.concat([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
) @@ -544,24 +512,24 @@ def prepare_latents(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -637,7 +605,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -658,17 +627,16 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image image = self.image_processor.preprocess(image) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables latents = self.prepare_latents( @@ -677,51 +645,45 @@ def __call__( batch_size, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if not output_type == "latent": image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) else: image = latents has_nsfw_concept = None @@ -731,11 +693,9 @@ def __call__( else: do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - image = self.image_processor.postprocess( - image, output_type=output_type, do_denormalize=do_denormalize) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index fb09dc473b674..f1e0347160085 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -21,8 +21,7 @@ import paddle.nn.functional as F import PIL from packaging import version -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -65,14 +64,11 @@ def prepare_mask_and_masked_image(image, mask): """ if isinstance(image, paddle.Tensor): if not 
isinstance(mask, paddle.Tensor): - raise TypeError( - f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not" - ) + raise TypeError(f"`image` is a paddle.Tensor but `mask` (type: {type(mask)} is not") # Batch single image if image.ndim == 3: - assert (image.shape[0] == 3 - ), "Image outside a batch should be of shape (3, H, W)" + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" image = image.unsqueeze(0) # Batch and add channel dim for single mask @@ -89,12 +85,9 @@ def prepare_mask_and_masked_image(image, mask): else: mask = mask.unsqueeze(1) - assert (image.ndim == 4 and - mask.ndim == 4), "Image and Mask must have 4 dimensions" - assert (image.shape[-2:] == mask.shape[-2:] - ), "Image and Mask must have the same spatial dimensions" - assert (image.shape[0] == mask.shape[0] - ), "Image and Mask must have the same batch size" + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" # Check image is in [-1, 1] if image.min() < -1 or image.max() > 1: @@ -110,8 +103,7 @@ def prepare_mask_and_masked_image(image, mask): # Image as float32 image = image.cast(paddle.float32) elif isinstance(mask, paddle.Tensor): - raise TypeError( - f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") + raise TypeError(f"`mask` is a paddle.Tensor but `image` (type: {type(image)} is not") else: # preprocess image if isinstance(image, (PIL.Image.Image, np.ndarray)): @@ -131,8 +123,7 @@ def prepare_mask_and_masked_image(image, mask): mask = [mask] if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): - mask = np.concatenate( - [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) mask = mask.astype(np.float32) / 255.0 elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): mask = np.concatenate([m[None, None, :] for m in mask], axis=0) @@ -176,49 +167,47 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "skip_prk_steps") and - scheduler.config.skip_prk_steps is False): + if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration" " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make" " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to" " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face" " Hub, it would be very nice if you could open a Pull request for the" - " `scheduler/scheduler_config.json` file") + " `scheduler/scheduler_config.json` file" + ) deprecate( "skip_prk_steps not set", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) new_config = dict(scheduler.config) new_config["skip_prk_steps"] = True scheduler._internal_dict = FrozenDict(new_config) @@ -239,12 +228,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -255,12 +242,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. 
If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -272,19 +256,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -320,29 +306,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -350,8 +338,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if 
do_classifier_free_guidance and negative_prompt_embeds is None: @@ -361,14 +348,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -378,47 +367,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -430,15 +415,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -454,39 +437,37 @@ def decode_latents(self, latents): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -499,18 +480,20 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -531,22 +514,20 @@ def prepare_latents( return latents def prepare_mask_latents( - self, - mask, - masked_image, - batch_size, - height, - width, - dtype, - generator, - do_classifier_free_guidance, ): + self, + mask, + masked_image, + batch_size, + height, + width, + dtype, + generator, + do_classifier_free_guidance, + ): # resize the mask to latents shape as we concatenate the mask to the latents # we do that before converting to dtype to avoid breaking in case we're using cpu_offload # and half precision - mask = F.interpolate( - mask, - size=(height // self.vae_scale_factor, - width // self.vae_scale_factor)) + mask = F.interpolate(mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)) mask = mask.cast(dtype) masked_image = masked_image.cast(dtype) @@ -554,13 +535,12 @@ def prepare_mask_latents( # encode the mask image into latents space so we can concatenate it to the latents if isinstance(generator, list): masked_image_latents = [ - self.vae.encode(masked_image[i:i + 1]).latent_dist.sample( - generator=generator[i]) for i in range(batch_size) + self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(batch_size) ] masked_image_latents = paddle.concat(masked_image_latents, axis=0) else: - masked_image_latents = self.vae.encode( - masked_image).latent_dist.sample(generator=generator) + masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) masked_image_latents = self.vae.config.scaling_factor * masked_image_latents # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method @@ -579,14 +559,12 @@ def prepare_mask_latents( f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." " Make sure the number of images that you pass is divisible by the total requested batch size." 
) - masked_image_latents = masked_image_latents.tile( - [batch_size // masked_image_latents.shape[0], 1, 1, 1]) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) - mask = paddle.concat([mask] * - 2) if do_classifier_free_guidance else mask - masked_image_latents = (paddle.concat([masked_image_latents] * 2) - if do_classifier_free_guidance else - masked_image_latents) + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) # aligning device to prevent device errors when concating it with the latent model input masked_image_latents = masked_image_latents.cast(dtype) @@ -594,26 +572,26 @@ def prepare_mask_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -726,7 +704,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -754,7 +733,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess mask and image mask, masked_image = prepare_mask_and_masked_image(image, mask_image) @@ -772,7 +752,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. Prepare mask latent variables mask, masked_image_latents = self.prepare_mask_latents( @@ -783,60 +764,51 @@ def __call__( width, prompt_embeds.dtype, generator, - do_classifier_free_guidance, ) + do_classifier_free_guidance, + ) # 8. 
Check that sizes of mask, masked image and latents match num_channels_mask = mask.shape[1] num_channels_masked_image = masked_image_latents.shape[1] - if (num_channels_latents + num_channels_mask + num_channels_masked_image - != self.unet.config.in_channels): + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: raise ValueError( f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input.") + " `pipeline.unet` or your `mask_image` or `image` input." + ) # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 10. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, mask, masked_image_latents], axis=1) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, mask, masked_image_latents], axis=1) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # must cast dtype, paddle.concat has bug.... latents = latents.cast(prompt_embeds.dtype) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -845,8 +817,7 @@ def __call__( image = self.decode_latents(latents) # 12. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 13. 
Convert to PIL if output_type == "pil": @@ -855,5 +826,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 67150c534019e..e321d55a86336 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -21,12 +21,10 @@ import paddle.nn.functional as F import PIL from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict -from ...loaders import (FromCkptMixin, LoraLoaderMixin, - TextualInversionLoaderMixin) +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor @@ -54,7 +52,8 @@ def preprocess_mask(mask, batch_size, scale_factor=8): w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 mask = mask.resize( (w // scale_factor, h // scale_factor), - resample=PIL_INTERPOLATION["nearest"], ) + resample=PIL_INTERPOLATION["nearest"], + ) mask = np.array(mask).astype(np.float32) / 255.0 mask = np.tile(mask, (4, 1, 1)) mask = np.vstack([mask[None]] * batch_size) @@ -70,7 +69,8 @@ def preprocess_mask(mask, batch_size, scale_factor=8): elif mask.shape[1] not in valid_mask_channel_sizes: raise ValueError( f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension," - f" but received mask of shape {tuple(mask.shape)}") + f" but received mask of shape {tuple(mask.shape)}" + ) # (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape mask = mask.mean(1, keepdim=True) h, w = mask.shape[-2:] @@ -79,9 +79,9 @@ def preprocess_mask(mask, batch_size, scale_factor=8): return mask -class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline, - TextualInversionLoaderMixin, - LoraLoaderMixin, FromCkptMixin): +class StableDiffusionInpaintPipelineLegacy( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin +): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. 
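For reference, the reformatted `StableDiffusionInpaintPipeline` whose diff concludes above is normally driven as follows; this is a minimal usage sketch, and the checkpoint name, file paths, and prompt are illustrative assumptions rather than anything taken from this diff:

import PIL.Image
from ppdiffusers import StableDiffusionInpaintPipeline

# Hypothetical 9-channel inpainting checkpoint and local files, for illustration only.
pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
init_image = PIL.Image.open("photo.png").convert("RGB").resize((512, 512))
mask_image = PIL.Image.open("mask.png").convert("L").resize((512, 512))  # mask is converted to "L" by the pipeline anyway

result = pipe(
    prompt="a red brick wall",
    image=init_image,
    mask_image=mask_image,
    num_inference_steps=50,
    guidance_scale=7.5,
)
result.images[0].save("inpainted.png")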
@@ -119,37 +119,33 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline, # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -157,11 +153,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -182,12 +174,10 @@ def __init__( " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -198,12 +188,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -215,19 +202,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
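The `_encode_prompt` hunks that follow batch the embeddings as [negative, positive] so that one UNet forward pass serves classifier-free guidance; a minimal sketch of how the two halves are recombined at denoising time (the helper name is ours, the arithmetic is the one used in the loops above and below):

def apply_classifier_free_guidance(noise_pred, guidance_scale=7.5):
    # noise_pred is the UNet output for paddle.concat([latents] * 2), so it
    # splits into an unconditional half and a text-conditioned half.
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)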
@@ -267,29 +256,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -297,8 +288,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -308,21 +298,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -330,47 +321,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -391,52 +378,49 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs def check_inputs( - self, - prompt, - strength, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if strength < 0 or strength > 1: - raise ValueError( - f"The value of strength should in [0.0, 1.0] but is {strength}") + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -449,59 +433,56 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps def get_timesteps(self, num_inference_steps, strength): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] return timesteps, num_inference_steps - t_start - def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, - generator): + def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, generator): image = image.cast(dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = self.vae.config.scaling_factor * init_latents # Expand init_latents for batch_size and num_images_per_prompt - init_latents = paddle.concat( - [init_latents] * num_images_per_prompt, axis=0) + init_latents = paddle.concat([init_latents] * num_images_per_prompt, axis=0) init_latents_orig = init_latents # add noise to latents using the timesteps - noise = randn_tensor( - init_latents.shape, generator=generator, dtype=dtype) + noise = randn_tensor(init_latents.shape, generator=generator, dtype=dtype) init_latents = self.scheduler.add_noise(init_latents, noise, timestep) latents = init_latents return latents, init_latents_orig, noise @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: 
Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -580,7 +561,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -602,21 +584,19 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image and mask if not isinstance(image, paddle.Tensor): image = preprocess_image(image, batch_size) - mask_image = preprocess_mask(mask_image, batch_size, - self.vae_scale_factor) + mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, - strength) - latent_timestep = timesteps[:1].tile( - [batch_size * num_images_per_prompt]) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = timesteps[:1].tile([batch_size * num_images_per_prompt]) # 6. Prepare latent variables # encode the init image into latents and scale the latents @@ -625,7 +605,8 @@ def __call__( latent_timestep, num_images_per_prompt, prompt_embeds.dtype, - generator, ) + generator, + ) # 7. Prepare mask latent mask = mask_image.cast(latents.dtype) @@ -635,50 +616,39 @@ def __call__( extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if i < len(timesteps) - 1: # masking if add_predicted_noise: - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise_pred_uncond, t) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise_pred_uncond, t) else: # https://github.com/huggingface/diffusers/pull/3749/files#diff-39d36ab1e622684e35fe6971c12fb44e24756bdc383aba3d7f6e3b1625bdaafc noise_timestep = timesteps[i + 1] - init_latents_proper = self.scheduler.add_noise( - init_latents_orig, noise, noise_timestep) + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, noise_timestep) else: init_latents_proper = init_latents_orig latents = (init_latents_proper * mask) + (latents * (1 - mask)) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -687,8 +657,7 @@ def __call__( image = self.decode_latents(latents) # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 12. 
Convert to PIL if output_type == "pil": @@ -697,5 +666,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index f39e50878b44e..02b3128d40d82 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -18,8 +18,7 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -43,11 +42,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -58,8 +53,7 @@ def preprocess(image): return image -class StableDiffusionInstructPix2PixPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class StableDiffusionInstructPix2PixPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion. 
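The legacy inpainting loop that closes just above keeps the original content outside the edited region by re-noising the initial image latents and blending them back through the mask at every step; a condensed sketch of that blend (the helper name is ours; the real loop picks `timesteps[i + 1]`, or the predicted noise when `add_predicted_noise` is set):

def blend_masked_latents(scheduler, init_latents_orig, noise, latents, mask, noise_timestep):
    # Re-noise the original latents to the given timestep, then keep the
    # re-noised latents where mask == 1 and the freshly denoised latents elsewhere.
    init_latents_proper = scheduler.add_noise(init_latents_orig, noise, noise_timestep)
    return init_latents_proper * mask + latents * (1 - mask)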
@@ -95,15 +89,16 @@ class StableDiffusionInstructPix2PixPipeline( _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -129,30 +124,31 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - num_inference_steps: int=100, - guidance_scale: float=7.5, - image_guidance_scale: float=1.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + num_inference_steps: int = 100, + guidance_scale: float = 7.5, + image_guidance_scale: float = 1.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -252,7 +248,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) if image is None: raise ValueError("`image` input cannot be undefined.") @@ -268,8 +265,7 @@ def __call__( # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. 
- do_classifier_free_guidance = (guidance_scale > 1.0 and - image_guidance_scale >= 1.0) + do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0 # check if scheduler is in sigmas space scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas") @@ -280,7 +276,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 3. Preprocess image image = preprocess(image) @@ -297,7 +294,8 @@ def __call__( num_images_per_prompt, prompt_embeds.dtype, do_classifier_free_guidance, - generator, ) + generator, + ) # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels @@ -308,7 +306,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. Check that shapes of latents and image match the UNet channels num_channels_image = image_latents.shape[1] @@ -318,45 +317,40 @@ def __call__( f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_image`: {num_channels_image} " f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # Expand the latents if we are doing classifier free guidance. # The latents are expanded 3 times because for pix2pix the guidance\ # is applied for both the text and the input image. - latent_model_input = (paddle.concat([latents] * 3) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 3) if do_classifier_free_guidance else latents # concat latents, image_latents in the channel dimension - scaled_latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) scaled_latent_model_input = paddle.concat( [ scaled_latent_model_input, image_latents.cast(scaled_latent_model_input.dtype), ], - axis=1, ) + axis=1, + ) # predict the noise residual - noise_pred = self.unet( - scaled_latent_model_input, - t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(scaled_latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # Hack: # For karras style schedulers the model does classifer free guidance using the # predicted_original_sample instead of the noise_pred. So we need to compute the # predicted_original_sample here if we are using a karras style scheduler. 
if scheduler_is_in_sigma_space: - step_index = ( - self.scheduler.timesteps == t).nonzero().item() + step_index = (self.scheduler.timesteps == t).nonzero().item() sigma = self.scheduler.sigmas[step_index] noise_pred = latent_model_input - sigma * noise_pred @@ -365,11 +359,13 @@ def __call__( ( noise_pred_text, noise_pred_image, - noise_pred_uncond, ) = noise_pred.chunk(3) - noise_pred = (noise_pred_uncond + guidance_scale * - (noise_pred_text - noise_pred_image - ) + image_guidance_scale * - (noise_pred_image - noise_pred_uncond)) + noise_pred_uncond, + ) = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + guidance_scale * (noise_pred_text - noise_pred_image) + + image_guidance_scale * (noise_pred_image - noise_pred_uncond) + ) # Hack: # For karras style schedulers the model does classifer free guidance using the @@ -381,13 +377,10 @@ def __call__( noise_pred = (noise_pred - latents) / (-sigma) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -396,8 +389,7 @@ def __call__( image = self.decode_latents(latents) # 11. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 12. Convert to PIL if output_type == "pil": @@ -406,17 +398,17 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
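InstructPix2Pix applies two guidance terms, one for the text instruction and one for the input image, over a UNet batch of three latents; a minimal sketch of the combination used in the loop above (the function name is ours, the arithmetic mirrors the diff):

def apply_pix2pix_guidance(noise_pred, guidance_scale=7.5, image_guidance_scale=1.5):
    # prompt_embeds are ordered [prompt, negative, negative] and the latents are
    # tripled, so the UNet output chunks into text, image and unconditional terms.
    noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3)
    return (
        noise_pred_uncond
        + guidance_scale * (noise_pred_text - noise_pred_image)
        + image_guidance_scale * (noise_pred_image - noise_pred_uncond)
    )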
@@ -456,29 +448,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -486,8 +480,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -497,21 +490,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -519,49 +513,44 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds] - prompt_embeds = paddle.concat([ - prompt_embeds, negative_prompt_embeds, negative_prompt_embeds - ]) + prompt_embeds = paddle.concat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -573,15 +562,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -596,32 +583,32 @@ def decode_latents(self, latents): return image def check_inputs( - self, - prompt, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -634,23 +621,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -665,13 +655,14 @@ def prepare_latents( return latents def prepare_image_latents( - self, - image, - batch_size, - num_images_per_prompt, - dtype, - do_classifier_free_guidance, - generator=None, ): + self, + image, + batch_size, + num_images_per_prompt, + dtype, + do_classifier_free_guidance, + generator=None, + ): if not isinstance(image, (paddle.Tensor, PIL.Image.Image, list)): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or list but is {type(image)}" @@ -687,16 +678,12 @@ def prepare_image_latents( ) if isinstance(generator, list): - image_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.mode() - for i in range(batch_size) - ] + image_latents = [self.vae.encode(image[i : i + 1]).latent_dist.mode() for i in range(batch_size)] image_latents = paddle.concat(image_latents, axis=0) else: image_latents = self.vae.encode(image).latent_dist.mode() - if (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] == 0): + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: # expand image_latents for batch_size deprecation_message = ( f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" @@ -708,12 +695,11 @@ def prepare_image_latents( "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_image_per_prompt = batch_size // image_latents.shape[0] - image_latents = paddle.concat( - [image_latents] * additional_image_per_prompt, axis=0) - elif (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] != 0): + image_latents = paddle.concat([image_latents] * additional_image_per_prompt, axis=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
) @@ -722,7 +708,6 @@ def prepare_image_latents( if do_classifier_free_guidance: uncond_image_latents = paddle.zeros_like(image_latents) - image_latents = paddle.concat( - [image_latents, image_latents, uncond_image_latents], axis=0) + image_latents = paddle.concat([image_latents, image_latents, uncond_image_latents], axis=0) return image_latents diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index f4fdd86cdbfb6..9151849ce7309 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -75,12 +75,13 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: EulerDiscreteScheduler, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: EulerDiscreteScheduler, + ): super().__init__() self.register_modules( @@ -88,10 +89,10 @@ def __init__( text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) + scheduler=scheduler, + ) - def _encode_prompt(self, prompt, do_classifier_free_guidance, - negative_prompt): + def _encode_prompt(self, prompt, do_classifier_free_guidance, negative_prompt): r""" Encodes the prompt into text encoder hidden states. @@ -112,23 +113,25 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, max_length=self.tokenizer.model_max_length, truncation=True, return_length=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) text_encoder_out = self.text_encoder( text_input_ids, - output_hidden_states=True, ) + output_hidden_states=True, + ) text_embeddings = text_encoder_out.hidden_states[-1] text_pooler_out = text_encoder_out.pooler_output @@ -140,14 +143,16 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -158,11 +163,13 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, max_length=max_length, truncation=True, return_length=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_encoder_out = self.text_encoder( uncond_input.input_ids, - output_hidden_states=True, ) + output_hidden_states=True, + ) uncond_embeddings = uncond_encoder_out.hidden_states[-1] uncond_pooler_out = uncond_encoder_out.pooler_output @@ -170,10 +177,8 @@ def _encode_prompt(self, prompt, do_classifier_free_guidance, # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - text_embeddings = paddle.concat( - [uncond_embeddings, text_embeddings]) - text_pooler_out = paddle.concat( - [uncond_pooler_out, text_pooler_out]) + text_embeddings = paddle.concat([uncond_embeddings, text_embeddings]) + text_pooler_out = paddle.concat([uncond_pooler_out, text_pooler_out]) return text_embeddings, text_pooler_out @@ -188,13 +193,13 @@ def decode_latents(self, latents): def check_inputs(self, prompt, image, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" ) @@ -216,30 +221,30 @@ def check_inputs(self, prompt, image, callback_steps): ) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = (batch_size, num_channels_latents, height, width) if latents is None: latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -247,19 +252,19 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], - num_inference_steps: int=75, - guidance_scale: float=9.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]], + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -362,16 +367,14 @@ def __call__( prompt = [""] * batch_size # 3. Encode input prompt - text_embeddings, text_pooler_out = self._encode_prompt( - prompt, do_classifier_free_guidance, negative_prompt) + text_embeddings, text_pooler_out = self._encode_prompt(prompt, do_classifier_free_guidance, negative_prompt) # 4. Preprocess image image = preprocess(image) image = image.cast(text_embeddings.dtype) if image.shape[1] == 3: # encode image if not in latent-space yet - image = (self.vae.encode(image).latent_dist.sample() * - self.vae.config.scaling_factor) + image = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor # 5. set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -386,27 +389,23 @@ def __call__( # "the This step theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default." 
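A minimal sketch of the conditioning-image preconditioning performed just below, assuming a scalar noise level: the low-resolution latents are upsampled 2x and scaled by c_in(sigma) = (sigma^2 + 1)^(-1/2) from Table 1 of Karras et al. (2022); with the noise level fixed at 0.0 the scale is exactly 1. The helper name is illustrative only:

import paddle
import paddle.nn.functional as F

def precondition_image_cond(image, noise_level=0.0):
    # image: [batch, C, H, W] latents of the low-resolution input
    sigma = paddle.to_tensor([noise_level], dtype=paddle.float32)
    c_in = (sigma**2 + 1) ** (-0.5)  # equals 1.0 when noise_level == 0.0
    # F.interpolate does not support float16, hence the float32 cast as in the pipeline below
    upsampled = F.interpolate(image.cast("float32"), scale_factor=2, mode="nearest")
    return upsampled * c_in[:, None, None, None]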
noise_level = paddle.to_tensor([0.0], dtype=paddle.float32) noise_level = paddle.concat([noise_level] * image.shape[0]) - inv_noise_level = (noise_level**2 + 1)**(-0.5) + inv_noise_level = (noise_level**2 + 1) ** (-0.5) # TODO F.interpolate donot support float16 - image_cond = (F.interpolate( - image.cast("float32"), scale_factor=2, - mode="nearest") * inv_noise_level[:, None, None, None]) + image_cond = ( + F.interpolate(image.cast("float32"), scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None] + ) image_cond = image_cond.cast(text_embeddings.dtype) noise_level_embed = paddle.concat( [ - paddle.ones( - [text_pooler_out.shape[0], 64], - dtype=text_pooler_out.dtype), - paddle.zeros( - [text_pooler_out.shape[0], 64], - dtype=text_pooler_out.dtype), + paddle.ones([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype), + paddle.zeros([text_pooler_out.shape[0], 64], dtype=text_pooler_out.dtype), ], - axis=1, ) + axis=1, + ) - timestep_condition = paddle.concat( - [noise_level_embed, text_pooler_out], axis=1) + timestep_condition = paddle.concat([noise_level_embed, text_pooler_out], axis=1) # 6. Prepare latent variables height, width = image.shape[2:] @@ -418,7 +417,8 @@ def __call__( width * 2, text_embeddings.dtype, generator, - latents, ) + latents, + ) # 7. Check that sizes of image and latents match num_channels_image = image.shape[1] @@ -428,7 +428,8 @@ def __call__( f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_image`: {num_channels_image} " f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 9. Denoising loop num_warmup_steps = 0 @@ -437,48 +438,39 @@ def __call__( for i, t in enumerate(timesteps): sigma = self.scheduler.sigmas[i] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - scaled_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) scaled_model_input = paddle.concat( - [ - scaled_model_input, - image_cond.cast(scaled_model_input.dtype) - ], - axis=1, ) + [scaled_model_input, image_cond.cast(scaled_model_input.dtype)], + axis=1, + ) # preconditioning parameter based on Karras et al. (2022) (table 1) timestep = paddle.log(sigma) * 0.25 noise_pred = self.unet( scaled_model_input, timestep, encoder_hidden_states=text_embeddings, - timestep_cond=timestep_condition, ).sample + timestep_cond=timestep_condition, + ).sample # in original repo, the output contains a variance channel that's not used noise_pred = noise_pred[:, :-1] # apply preconditioning, based on table 1 in Karras et al. 
(2022) inv_sigma = 1 / (sigma**2 + 1) - noise_pred = ( - inv_sigma * latent_model_input + - self.scheduler.scale_model_input(sigma, t) * noise_pred) + noise_pred = inv_sigma * latent_model_input + self.scheduler.scale_model_input(sigma, t) * noise_pred # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, - latents).prev_sample + latents = self.scheduler.step(noise_pred, t, latents).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -491,6 +483,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py index 13e7d28b153ee..93a2487ee267a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_mega.py @@ -21,8 +21,9 @@ from ...utils import logging from .pipeline_stable_diffusion import StableDiffusionPipeline from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline -from .pipeline_stable_diffusion_inpaint_legacy import \ - StableDiffusionInpaintPipelineLegacy +from .pipeline_stable_diffusion_inpaint_legacy import ( + StableDiffusionInpaintPipelineLegacy, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -61,36 +62,31 @@ def __call__(self, *args, **kwargs): return self.text2img(*args, **kwargs) def text2img( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: 
Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): - expected_components = inspect.signature( - StableDiffusionPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(StableDiffusionPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = StableDiffusionPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) output = temp_pipeline( prompt=prompt, height=height, @@ -108,38 +104,34 @@ def text2img( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) return output def img2img( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): - expected_components = inspect.signature( - StableDiffusionImg2ImgPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): + expected_components = inspect.signature(StableDiffusionImg2ImgPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = StableDiffusionImg2ImgPipeline( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) output = temp_pipeline( prompt=prompt, image=image, @@ -156,41 +148,37 @@ def img2img( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) return output def inpaint_legacy( - self, - prompt: Union[str, List[str]], - image: Union[paddle.Tensor, PIL.Image.Image]=None, - mask_image: Union[paddle.Tensor, PIL.Image.Image]=None, - strength: float=0.8, - 
num_inference_steps: Optional[int]=50, - guidance_scale: Optional[float]=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - add_predicted_noise: Optional[bool]=False, - eta: Optional[float]=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): - expected_components = inspect.signature( - StableDiffusionInpaintPipelineLegacy.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + self, + prompt: Union[str, List[str]], + image: Union[paddle.Tensor, PIL.Image.Image] = None, + mask_image: Union[paddle.Tensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: Optional[float] = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): + expected_components = inspect.signature(StableDiffusionInpaintPipelineLegacy.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = StableDiffusionInpaintPipelineLegacy( - **components, - requires_safety_checker=self.config.requires_safety_checker) + **components, requires_safety_checker=self.config.requires_safety_checker + ) output = temp_pipeline( prompt=prompt, image=image, @@ -209,6 +197,7 @@ def inpaint_legacy( return_dict=return_dict, callback=callback, callback_steps=callback_steps, - **kwargs, ) + **kwargs, + ) return output diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index ce7f96b22cc24..3ad5c35785e9a 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -16,8 +16,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -48,8 +47,7 @@ """ -class StableDiffusionModelEditingPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image model editing using "Editing Implicit Assumptions in Text-to-Image Diffusion Models". This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the @@ -80,22 +78,22 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: SchedulerMixin, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, - with_to_k: bool=True, - with_augs: list=AUGS_CONST, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: SchedulerMixin, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + with_to_k: bool = True, + with_augs: list = AUGS_CONST, + ): super().__init__() if isinstance(scheduler, PNDMScheduler): - logger.error( - "PNDMScheduler for this pipeline is currently not supported.") + logger.error("PNDMScheduler for this pipeline is currently not supported.") if safety_checker is None and requires_safety_checker: logger.warning( @@ -120,8 +118,9 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) self.with_to_k = with_to_k @@ -147,18 +146,12 @@ def append_ca(net_): append_ca(net[1]) # get projection matrices - self.ca_clip_layers = [ - l for l in ca_layers if l.to_v.in_features == 768 - ] + self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768] self.projection_matrices = [l.to_v for l in self.ca_clip_layers] self.og_matrices = [copy.deepcopy(l.to_v) for l in self.ca_clip_layers] if self.with_to_k: - self.projection_matrices = self.projection_matrices + [ - l.to_k for l in self.ca_clip_layers - ] - self.og_matrices = self.og_matrices + [ - copy.deepcopy(l.to_k) for l in self.ca_clip_layers - ] + self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers] + self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers] # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing def enable_vae_slicing(self): @@ -179,13 +172,14 @@ def disable_vae_slicing(self): # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
Args: @@ -224,29 +218,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype) @@ -254,8 +250,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -265,21 +260,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -287,47 +283,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - dtype=self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -348,54 +340,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -408,23 +396,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -440,11 +431,12 @@ def prepare_latents( @paddle.no_grad() def edit_model( - self, - source_prompt: str, - destination_prompt: str, - lamb: float=0.1, - restart_params: bool=True, ): + self, + source_prompt: str, + destination_prompt: str, + lamb: float = 0.1, + restart_params: bool = True, + ): r""" Apply model editing via closed-form solution (see Eq. 
5 in the TIME paper https://arxiv.org/abs/2303.08084) Args: @@ -467,20 +459,17 @@ def edit_model( l.to_v = copy.deepcopy(self.og_matrices[idx_]) self.projection_matrices[idx_] = l.to_v if self.with_to_k: - l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + - idx_]) + l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_]) self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k # set up sentences old_texts = [source_prompt] new_texts = [destination_prompt] # add augmentations - base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][ - 1:] + base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] for aug in self.with_augs: old_texts.append(aug + base) - base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][ - 1:] + base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] for aug in self.with_augs: new_texts.append(aug + base) @@ -492,7 +481,8 @@ def edit_model( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_embeddings = self.text_encoder(text_input.input_ids)[0] old_emb, new_emb = text_embeddings old_embs.append(old_emb) @@ -504,12 +494,12 @@ def edit_model( tokens_a = self.tokenizer(old_text).input_ids tokens_b = self.tokenizer(new_text).input_ids tokens_a = [ - self.tokenizer.encode("a ")["input_ids"][1] - if self.tokenizer.decode(t) == "an" else t for t in tokens_a + self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t + for t in tokens_a ] tokens_b = [ - self.tokenizer.encode("a ")["input_ids"][1] - if self.tokenizer.decode(t) == "an" else t for t in tokens_b + self.tokenizer.encode("a ")["input_ids"][1] if self.tokenizer.decode(t) == "an" else t + for t in tokens_b ] num_orig_tokens = len(tokens_a) idxs_replace = [] @@ -529,8 +519,7 @@ def edit_model( # prepare batch: for each pair of setences, old context and new values contexts, valuess = [], [] - for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, - idxs_replaces): + for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces): context = old_emb.detach() values = [] with paddle.no_grad(): @@ -545,52 +534,47 @@ def edit_model( mat1 = lamb * self.projection_matrices[layer_num].weight # mat2 = \lambda I + \sum{k k^T} - mat2 = lamb * paddle.eye(self.projection_matrices[layer_num] - .weight.shape[1]) + mat2 = lamb * paddle.eye(self.projection_matrices[layer_num].weight.shape[1]) # aggregate sums for mat1, mat2 for context, values in zip(contexts, valuess): - context_vector = context.reshape( - [context.shape[0], context.shape[1], 1]) - context_vector_T = context.reshape( - [context.shape[0], 1, context.shape[1]]) - value_vector = values[layer_num].reshape([ - values[layer_num].shape[0], values[layer_num].shape[1], 1 - ]) - for_mat1 = (value_vector @context_vector_T).sum(axis=0) - for_mat2 = (context_vector @context_vector_T).sum(axis=0) + context_vector = context.reshape([context.shape[0], context.shape[1], 1]) + context_vector_T = context.reshape([context.shape[0], 1, context.shape[1]]) + value_vector = values[layer_num].reshape([values[layer_num].shape[0], values[layer_num].shape[1], 1]) + for_mat1 = (value_vector @ context_vector_T).sum(axis=0) + for_mat2 = (context_vector @ context_vector_T).sum(axis=0) mat1 += for_mat1 mat2 += for_mat2 # update projection matrix - mat = mat1 @paddle.inverse(mat2) - self.projection_matrices[ - layer_num].weight = paddle.create_parameter( - 
shape=mat.shape, - dtype=mat.dtype, - default_initializer=paddle.nn.initializer.Assign(mat), ) + mat = mat1 @ paddle.inverse(mat2) + self.projection_matrices[layer_num].weight = paddle.create_parameter( + shape=mat.shape, + dtype=mat.dtype, + default_initializer=paddle.nn.initializer.Assign(mat), + ) @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. Args: @@ -668,7 +652,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -690,7 +675,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -705,43 +691,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -754,8 +735,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL image = self.numpy_to_pil(image) @@ -764,11 +744,9 @@ def __call__( image = self.decode_latents(latents) # 9. 
Run safety checker - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index cc2586bec5107..5258f174894bf 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -15,8 +15,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -46,8 +45,7 @@ """ -class StableDiffusionPanoramaPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using "MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation". @@ -81,20 +79,20 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: DDIMScheduler, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if isinstance(scheduler, PNDMScheduler): - logger.error( - "PNDMScheduler for this pipeline is currently not supported.") + logger.error("PNDMScheduler for this pipeline is currently not supported.") if safety_checker is None and requires_safety_checker: logger.warning( @@ -119,19 +117,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder 
hidden states. @@ -171,29 +171,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -201,8 +203,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -212,21 +213,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -234,47 +236,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
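For reference, a compact sketch of the standard two-pass classifier-free guidance that this [negative, positive] embedding order enables; the combination mirrors the `noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)` expression used in the denoising loops elsewhere in this diff, and the helper name is illustrative only:

import paddle

def apply_cfg(noise_pred, guidance_scale=7.5):
    # noise_pred: [2 * batch, C, H, W]; the first half comes from the negative/unconditional
    # embeddings, the second half from the text-conditioned embeddings
    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)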
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -295,54 +293,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
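The prepare_extra_step_kwargs hunk above keeps the signature probing that decides whether eta and generator can be forwarded to the scheduler. A small self-contained sketch of the same introspection (scheduler here is any object with a step method):

import inspect

def build_step_kwargs(scheduler, eta, generator):
    params = set(inspect.signature(scheduler.step).parameters.keys())
    kwargs = {}
    if "eta" in params:        # eta (η) only applies to DDIM-style schedulers
        kwargs["eta"] = eta
    if "generator" in params:  # some schedulers draw extra noise and accept a generator
        kwargs["generator"] = generator
    return kwargs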
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -355,23 +349,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -385,11 +382,7 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - def get_views(self, - panorama_height, - panorama_width, - window_size=64, - stride=8): + def get_views(self, panorama_height, panorama_width, window_size=64, stride=8): # Here, we define the mappings F_i (see Eq. 
7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113) panorama_height /= 8 panorama_width /= 8 @@ -408,25 +401,25 @@ def get_views(self, @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=512, - width: Optional[int]=2048, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 2048, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -508,7 +501,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -530,7 +524,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -545,7 +540,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Define panorama grid and initialize views for synthesis. views = self.get_views(height, width) @@ -558,8 +554,7 @@ def __call__( # 8. Denoising loop # Each denoising step also includes refinement of the latents with respect to the # views. 
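The panorama pipeline above splits the latent into overlapping windows (get_views) and, per Eq. 5 of the MultiDiffusion paper, averages the per-window denoised latents back together through the value/count buffers and paddle.where(count > 0, value / count, value). A rough sketch of the window enumeration; the block-count arithmetic here is my assumption, the pipeline's own get_views is the reference:

def sliding_views(height, width, window_size=64, stride=8):
    # work in latent space (factor-8 downsampling), then enumerate overlapping windows
    height, width = height // 8, width // 8
    num_h = (height - window_size) // stride + 1 if height > window_size else 1
    num_w = (width - window_size) // stride + 1 if width > window_size else 1
    views = []
    for i in range(num_h * num_w):
        h_start = (i // num_w) * stride
        w_start = (i % num_w) * stride
        views.append((h_start, h_start + window_size, w_start, w_start + window_size))
    return views

# a 512x2048 panorama gives 64x64 latent windows stepped every 8 latent pixels
print(len(sliding_views(512, 2048)))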
- num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): count.zero_() @@ -572,44 +567,39 @@ def __call__( # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113 for h_start, h_end, w_start, w_end in views: # get the latents corresponding to the current view coordinates - latents_for_view = latents[:, :, h_start:h_end, w_start: - w_end] + latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents_for_view] * 2) - if do_classifier_free_guidance else - latents_for_view) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 latents_view_denoised = self.scheduler.step( - noise_pred, t, latents_for_view, - **extra_step_kwargs).prev_sample - value[:, :, h_start:h_end, w_start: - w_end] += latents_view_denoised + noise_pred, t, latents_for_view, **extra_step_kwargs + ).prev_sample + value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised count[:, :, h_start:h_end, w_start:w_end] += 1 # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113 latents = paddle.where(count > 0, value / count, value) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -618,8 +608,7 @@ def __call__( image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. 
Convert to PIL if output_type == "pil": @@ -628,5 +617,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 1ae1d85aacf36..7a5cb8d8a0a5e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -22,19 +22,33 @@ import paddle.nn.functional as F import paddle.optimizer import PIL -from paddlenlp.transformers import (BlipForConditionalGeneration, BlipProcessor, - CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import ( + BlipForConditionalGeneration, + BlipProcessor, + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, +) from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention -from ...schedulers import (DDIMScheduler, DDPMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler) +from ...schedulers import ( + DDIMScheduler, + DDPMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, +) from ...schedulers.scheduling_ddim_inverse import DDIMInverseScheduler -from ...utils import (PIL_INTERPOLATION, BaseOutput, deprecate, logging, - randint_tensor, randn_tensor, replace_example_docstring) +from ...utils import ( + PIL_INTERPOLATION, + BaseOutput, + deprecate, + logging, + randint_tensor, + randn_tensor, + replace_example_docstring, +) from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -172,11 +186,7 @@ def preprocess(image): w, h = image[0].size w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 - image = [ - np.array(i.resize( - (w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] - for i in image - ] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -194,13 +204,11 @@ def prepare_unet(unet: UNet2DConditionModel): module_name = name.replace(".processor", "") module: nn.Layer = unet.get_sublayer(module_name) if "attn2" in name: - pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor( - is_pix2pix_zero=True) + pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=True) for params in module.parameters(): params.stop_gradient = False else: - pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor( - is_pix2pix_zero=False) + pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=False) for params in module.parameters(): params.stop_gradient = True @@ -213,7 +221,7 @@ def __init__(self): self.loss = 0.0 def compute_loss(self, predictions, targets): - self.loss += ((predictions - targets)**2).sum((1, 2)).mean(0) + self.loss += ((predictions - targets) ** 2).sum((1, 2)).mean(0) class Pix2PixZeroAttnProcessor: @@ -226,23 +234,22 @@ def __init__(self, is_pix2pix_zero=False): self.reference_cross_attn_map = {} def __call__( - self, - attn: Attention, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - timestep=None, - loss=None, ): + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + timestep=None, + loss=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -255,14 +262,11 @@ def __call__( if self.is_pix2pix_zero and timestep is not None: # new bookkeeping to save the attention weights. 
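The Pix2PixZeroAttnProcessor hunks above do two jobs with the cross-attention maps: on the reference pass (loss is None) they store the map keyed by timestep, and on the edited pass they pop it back out and feed it to Pix2PixZeroL2Loss.compute_loss. A toy stand-in for just that bookkeeping, with loss_tracker as a placeholder for the pipeline's loss object:

class AttnMapBookkeeping:
    def __init__(self):
        self.reference_maps = {}

    def record_or_compare(self, timestep, attn_probs, loss_tracker=None):
        if loss_tracker is None:
            # reference pass: remember this timestep's cross-attention map
            self.reference_maps[timestep] = attn_probs
        else:
            # edited pass: penalize drift away from the stored map
            reference = self.reference_maps.pop(timestep)
            loss_tracker.compute_loss(attn_probs, reference)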
if loss is None: - self.reference_cross_attn_map[timestep.item( - )] = attention_probs.detach().flatten(0, 1) + self.reference_cross_attn_map[timestep.item()] = attention_probs.detach().flatten(0, 1) # compute loss elif loss is not None: - prev_attn_probs = self.reference_cross_attn_map.pop( - timestep.item()) - loss.compute_loss( - attention_probs.flatten(0, 1), prev_attn_probs) + prev_attn_probs = self.reference_cross_attn_map.pop(timestep.item()) + loss.compute_loss(attention_probs.flatten(0, 1), prev_attn_probs) hidden_states = paddle.matmul(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) @@ -314,20 +318,24 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline): ] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDPMScheduler, DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, ], - feature_extractor: CLIPImageProcessor, - safety_checker: StableDiffusionSafetyChecker, - inverse_scheduler: DDIMInverseScheduler, - caption_generator: BlipForConditionalGeneration, - caption_processor: BlipProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[ + DDPMScheduler, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + ], + feature_extractor: CLIPImageProcessor, + safety_checker: StableDiffusionSafetyChecker, + inverse_scheduler: DDIMInverseScheduler, + caption_generator: BlipForConditionalGeneration, + caption_processor: BlipProcessor, + requires_safety_checker: bool = True, + ): super().__init__() if safety_checker is None and requires_safety_checker: @@ -356,19 +364,21 @@ def __init__( feature_extractor=feature_extractor, caption_processor=caption_processor, caption_generator=caption_generator, - inverse_scheduler=inverse_scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + inverse_scheduler=inverse_scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
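The __init__ hunk above also reflows self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1); that factor is what prepare_latents later divides height and width by. A quick worked example (the block_out_channels list is illustrative, not a real config):

def latent_shape(batch_size, num_channels_latents, height, width, block_out_channels):
    # each VAE down block halves the spatial resolution, hence the power of two
    vae_scale_factor = 2 ** (len(block_out_channels) - 1)
    return (batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)

# a 4-level VAE gives factor 8, so a 512x512 image maps to 64x64 latents
print(latent_shape(1, 4, 512, 512, [128, 256, 512, 512]))  # (1, 4, 64, 64)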
@@ -408,29 +418,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -438,8 +450,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -449,21 +460,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -471,47 +483,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
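The tile/reshape pair above is how one set of prompt embeddings is duplicated for num_images_per_prompt generations: repeat along the sequence axis, then fold the copies into the batch axis. A compact sketch with made-up tensor sizes:

import paddle

def duplicate_per_prompt(prompt_embeds, num_images_per_prompt):
    bs, seq_len, dim = prompt_embeds.shape
    out = prompt_embeds.tile([1, num_images_per_prompt, 1])         # repeat copies
    return out.reshape([bs * num_images_per_prompt, seq_len, dim])  # fold into batch

embeds = paddle.randn([2, 77, 768])
print(duplicate_per_prompt(embeds, 3).shape)  # [6, 77, 768]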
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -532,66 +540,65 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - source_embeds, - target_embeds, - callback_steps, - prompt_embeds=None, ): + self, + prompt, + image, + source_embeds, + target_embeds, + callback_steps, + prompt_embeds=None, + ): if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if source_embeds is None and target_embeds is None: - raise ValueError( - "`source_embeds` and `target_embeds` cannot be undefined.") + raise ValueError("`source_embeds` and `target_embeds` cannot be undefined.") if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -611,43 +618,38 @@ def generate_caption(self, images): # make sure cast caption_generator position_ids dtype int64 try: self.caption_generator.text_decoder.bert.embeddings.position_ids = ( - self.caption_generator.text_decoder.bert.embeddings. - position_ids.cast("int64")) + self.caption_generator.text_decoder.bert.embeddings.position_ids.cast("int64") + ) except Exception: pass text = "a photography of" - inputs = self.caption_processor( - images=images, text=text, return_tensors="pd") - inputs["pixel_values"] = inputs["pixel_values"].cast( - self.caption_generator.dtype) + inputs = self.caption_processor(images=images, text=text, return_tensors="pd") + inputs["pixel_values"] = inputs["pixel_values"].cast(self.caption_generator.dtype) outputs = self.caption_generator.generate(**inputs, max_length=128)[0] # offload caption generator - caption = self.caption_processor.batch_decode( - outputs, skip_special_tokens=True)[0] + caption = self.caption_processor.batch_decode(outputs, skip_special_tokens=True)[0] return text + " " + caption - def construct_direction(self, - embs_source: paddle.Tensor, - embs_target: paddle.Tensor): + def construct_direction(self, embs_source: paddle.Tensor, embs_target: paddle.Tensor): """Constructs the edit direction to steer the image generation process semantically.""" return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0) @paddle.no_grad() - def get_embeds(self, prompt: List[str], - batch_size: int=16) -> paddle.Tensor: + def get_embeds(self, prompt: List[str], batch_size: int = 16) -> paddle.Tensor: num_prompts = len(prompt) embeds = [] for i in range(0, num_prompts, batch_size): - prompt_slice = prompt[i:i + batch_size] + prompt_slice = prompt[i : i + batch_size] input_ids = self.tokenizer( prompt_slice, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ).input_ids + return_tensors="pd", + ).input_ids embeds.append(self.text_encoder(input_ids)[0]) @@ -668,10 +670,7 @@ def prepare_image_latents(self, image, batch_size, dtype, generator=None): ) if isinstance(generator, list): - latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - for i in range(batch_size) - ] + latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)] latents = paddle.concat(latents, axis=0) else: latents = self.vae.encode(image).latent_dist.sample(generator) @@ -691,10 +690,10 @@ def 
prepare_image_latents(self, image, batch_size, dtype, generator=None): "len(prompt) != len(image)", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) additional_latents_per_image = batch_size // latents.shape[0] - latents = paddle.concat( - [latents] * additional_latents_per_image, axis=0) + latents = paddle.concat([latents] * additional_latents_per_image, axis=0) else: raise ValueError( f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts." @@ -704,21 +703,16 @@ def prepare_image_latents(self, image, batch_size, dtype, generator=None): return latents - def get_epsilon(self, - model_output: paddle.Tensor, - sample: paddle.Tensor, - timestep: int): + def get_epsilon(self, model_output: paddle.Tensor, sample: paddle.Tensor, timestep: int): pred_type = self.inverse_scheduler.config.prediction_type alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep] beta_prod_t = 1 - alpha_prod_t if pred_type == "epsilon": return model_output elif pred_type == "sample": - return (sample - alpha_prod_t** - (0.5) * model_output) / beta_prod_t**(0.5) + return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5) elif pred_type == "v_prediction": - return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5 - ) * sample + return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`" @@ -728,15 +722,11 @@ def auto_corr_loss(self, hidden_states, generator=None): reg_loss = 0.0 for i in range(hidden_states.shape[0]): for j in range(hidden_states.shape[1]): - noise = hidden_states[i:i + 1, j:j + 1, :, :] + noise = hidden_states[i : i + 1, j : j + 1, :, :] while True: - roll_amount = randint_tensor( - noise.shape[2] // 2, shape=(1, ), - generator=generator).item() - reg_loss += (noise * paddle.roll( - noise, shifts=roll_amount, axis=2)).mean()**2 - reg_loss += (noise * paddle.roll( - noise, shifts=roll_amount, axis=3)).mean()**2 + roll_amount = randint_tensor(noise.shape[2] // 2, shape=(1,), generator=generator).item() + reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=2)).mean() ** 2 + reg_loss += (noise * paddle.roll(noise, shifts=roll_amount, axis=3)).mean() ** 2 if noise.shape[2] <= 8: break @@ -751,29 +741,29 @@ def kl_divergence(self, hidden_states): @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Optional[Union[str, List[str]]]=None, - image: Optional[Union[paddle.Tensor, PIL.Image.Image]]=None, - source_embeds: paddle.Tensor=None, - target_embeds: paddle.Tensor=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - cross_attention_guidance_amount: float=0.1, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Optional[Union[str, List[str]]] = None, + image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None, + source_embeds: 
paddle.Tensor = None, + target_embeds: paddle.Tensor = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + cross_attention_guidance_amount: float = 0.1, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -857,7 +847,8 @@ def __call__( source_embeds, target_embeds, callback_steps, - prompt_embeds, ) + prompt_embeds, + ) # 3. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -881,7 +872,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -897,7 +889,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) latents_init = latents.clone() # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline @@ -908,37 +901,31 @@ def __call__( self.unet = prepare_unet(self.unet) # 7. Denoising loop where we obtain the cross-attention maps. - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs={"timestep": t}, ).sample + cross_attention_kwargs={"timestep": t}, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -952,15 +939,12 @@ def __call__( # 10. 
Second denoising loop to generate the edited image. latents = latents_init - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # we want to learn the latent such that it steers the generation # process towards the edited direction, so make the make initial @@ -969,9 +953,7 @@ def __call__( x_in.stop_gradient = False # optimizer - opt = paddle.optimizer.SGD( - parameters=[x_in], - learning_rate=cross_attention_guidance_amount) + opt = paddle.optimizer.SGD(parameters=[x_in], learning_rate=cross_attention_guidance_amount) with paddle.set_grad_enabled(True): # initialize loss @@ -982,8 +964,8 @@ def __call__( x_in, t, encoder_hidden_states=prompt_embeds_edit.detach(), - cross_attention_kwargs={"timestep": t, - "loss": loss}, ).sample + cross_attention_kwargs={"timestep": t, "loss": loss}, + ).sample loss.loss.backward(retain_graph=False) opt.step() @@ -993,32 +975,28 @@ def __call__( x_in.detach(), t, encoder_hidden_states=prompt_embeds_edit, - cross_attention_kwargs={"timestep": None}, ).sample + cross_attention_kwargs={"timestep": None}, + ).sample latents = x_in.detach().chunk(2)[0] # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() # 11. Post-process the latents. edited_image = self.decode_latents(latents) # 12. Run the safety checker. - edited_image, has_nsfw_concept = self.run_safety_checker( - edited_image, prompt_embeds.dtype) + edited_image, has_nsfw_concept = self.run_safety_checker(edited_image, prompt_embeds.dtype) # 13. Convert to PIL. 
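The second denoising loop above treats the latent itself as the thing being optimized: x_in gets stop_gradient = False, a plain SGD optimizer with learning_rate=cross_attention_guidance_amount updates it, and the loss measures the mismatch between current and reference cross-attention maps. A heavily simplified single-step sketch, where attn_loss_fn is a placeholder for the UNet forward that fills Pix2PixZeroL2Loss:

import paddle

def refine_latent_once(x_in, attn_loss_fn, guidance_amount):
    x_in.stop_gradient = False
    opt = paddle.optimizer.SGD(parameters=[x_in], learning_rate=guidance_amount)
    with paddle.set_grad_enabled(True):
        loss = attn_loss_fn(x_in)   # scalar cross-attention-map loss
        loss.backward()
        opt.step()
    return x_in.detach()            # refined latent used for the real noise prediction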
if output_type == "pil": @@ -1027,31 +1005,30 @@ def __call__( if not return_dict: return (edited_image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=edited_image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=edited_image, nsfw_content_detected=has_nsfw_concept) @paddle.no_grad() @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING) def invert( - self, - prompt: Optional[str]=None, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - num_inference_steps: int=50, - guidance_scale: float=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - cross_attention_guidance_amount: float=0.1, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - lambda_auto_corr: float=20.0, - lambda_kl: float=20.0, - num_reg_steps: int=5, - num_auto_corr_rolls: int=5, ): + self, + prompt: Optional[str] = None, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + num_inference_steps: int = 50, + guidance_scale: float = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + cross_attention_guidance_amount: float = 0.1, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + lambda_auto_corr: float = 20.0, + lambda_kl: float = 20.0, + num_reg_steps: int = 5, + num_auto_corr_rolls: int = 5, + ): r""" Function used to generate inverted latents given a prompt and image. @@ -1130,8 +1107,7 @@ def invert( image = preprocess(image) # 4. Prepare latent variables - latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, - generator) + latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, generator) # 5. Encode input prompt num_images_per_prompt = 1 @@ -1139,7 +1115,8 @@ def invert( prompt, num_images_per_prompt, do_classifier_free_guidance, - prompt_embeds=prompt_embeds, ) + prompt_embeds=prompt_embeds, + ) # 4. Prepare timesteps self.inverse_scheduler.set_timesteps(num_inference_steps) @@ -1150,28 +1127,25 @@ def invert( self.unet = prepare_unet(self.unet) # 7. Denoising loop where we obtain the cross-attention maps. 
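The inversion loop that follows regularizes the predicted noise toward IID standard normal with an auto-correlation term (random rolls of the noise against itself) and a KL term. A hedged sketch of one auto-correlation penalty, using paddle.randint in place of the pipeline's randint_tensor helper:

import paddle

def auto_corr_penalty(noise, max_shift):
    # correlate the noise with a randomly rolled copy of itself along H and W;
    # for true white noise this expectation is ~0, so its square is a cheap penalty
    shift = paddle.randint(1, max_shift, shape=[1]).item()
    penalty = (noise * paddle.roll(noise, shifts=shift, axis=2)).mean() ** 2
    penalty += (noise * paddle.roll(noise, shifts=shift, axis=3)).mean() ** 2
    return penalty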
- num_warmup_steps = ( - len(timesteps) - num_inference_steps * self.inverse_scheduler.order) + num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order with self.progress_bar(total=num_inference_steps - 1) as progress_bar: for i, t in enumerate(timesteps[:-1]): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.inverse_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs={"timestep": t}, ).sample + cross_attention_kwargs={"timestep": t}, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # regularization of the noise prediction with paddle.set_grad_enabled(True): @@ -1182,11 +1156,9 @@ def invert( var.stop_gradient = False # Derive epsilon from model output before regularizing to IID standard normal - var_epsilon = self.get_epsilon( - var, latent_model_input.detach(), t) + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) - l_ac = self.auto_corr_loss( - var_epsilon, generator=generator) + l_ac = self.auto_corr_loss(var_epsilon, generator=generator) l_ac.backward() grad = var.grad.detach() / num_auto_corr_rolls @@ -1197,8 +1169,7 @@ def invert( var.stop_gradient = False # Derive epsilon from model output before regularizing to IID standard normal - var_epsilon = self.get_epsilon( - var, latent_model_input.detach(), t) + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) l_kld = self.kl_divergence(var_epsilon) l_kld.backward() @@ -1209,13 +1180,12 @@ def invert( noise_pred = noise_pred.detach() # compute the previous noisy sample x_t -> x_t-1 - latents = self.inverse_scheduler.step(noise_pred, t, - latents).prev_sample + latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample # call the callback, if provided if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.inverse_scheduler.order == 0): + (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0 + ): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -1232,5 +1202,4 @@ def invert( if not return_dict: return (inverted_latents, image) - return Pix2PixInversionPipelineOutput( - latents=inverted_latents, images=image) + return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index 56fac99a80c30..3a8030d6a986d 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -17,8 +17,7 @@ import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from 
paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -52,21 +51,20 @@ def __init__(self): self.attention_probs = None def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, ): + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -90,8 +88,7 @@ def __call__( # Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input -class StableDiffusionSAGPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -121,15 +118,16 @@ class StableDiffusionSAGPipeline(DiffusionPipeline, _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() self.register_modules( @@ -139,19 +137,21 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
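CrossAttnStoreProcessor above exists mostly for its side effect: it keeps the latest self-attention probabilities so the SAG step can threshold them into a mask later. A hedged stand-in that shows only that bookkeeping with plain scaled dot-product attention (the real processor goes through attn.to_q/to_k/to_v and the prepared attention mask):

import paddle
import paddle.nn.functional as F

class StoreAttnProbsSketch:
    def __init__(self):
        self.attention_probs = None

    def __call__(self, query, key, value, scale=1.0):
        probs = F.softmax(paddle.matmul(query, key, transpose_y=True) * scale, axis=-1)
        self.attention_probs = probs           # read later by the SAG masking step
        return paddle.matmul(probs, value)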
@@ -191,29 +191,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -221,8 +223,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -232,21 +233,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -254,47 +256,43 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
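The hasattr/use_attention_mask guard above decides whether the tokenizer's attention mask is forwarded to the text encoder at all. The same check, isolated (text_encoder_config and tokenized stand for whatever objects the pipeline already holds):

def pick_attention_mask(text_encoder_config, tokenized):
    # only pass a mask when the text encoder was configured to consume one
    if getattr(text_encoder_config, "use_attention_mask", False):
        return tokenized.attention_mask
    return None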
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, dtype): if self.safety_checker is not None: - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) else: has_nsfw_concept = None return image, has_nsfw_concept @@ -315,54 +313,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -375,23 +369,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -408,26 +405,26 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - sag_scale: float=0.75, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + sag_scale: float = 0.75, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -512,7 +509,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. 
Define call parameters if prompt is not None and isinstance(prompt, str): @@ -538,7 +536,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -553,17 +552,16 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. Denoising loop store_processor = CrossAttnStoreProcessor() - self.unet.mid_block.attentions[0].transformer_blocks[ - 0].attn1.processor = store_processor - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order map_size = None @@ -571,28 +569,25 @@ def get_map_size(module, input, output): nonlocal map_size map_size = output.sample.shape[-2:] - forward_hook = self.unet.mid_block.attentions[ - 0].register_forward_post_hook(get_map_size) + forward_hook = self.unet.mid_block.attentions[0].register_forward_post_hook(get_map_size) with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # perform self-attention guidance with the stored self-attentnion map if do_self_attention_guidance: @@ -603,23 +598,19 @@ def get_map_size(module, input, output): # DDIM-like prediction of x0 pred_x0 = self.pred_x0(latents, noise_pred_uncond, t) # get the stored attention maps - uncond_attn, cond_attn = store_processor.attention_probs.chunk( - 2) + uncond_attn, cond_attn = store_processor.attention_probs.chunk(2) # self-attention-based degrading of latents degraded_latents = self.sag_masking( pred_x0, uncond_attn, map_size, t, - self.pred_epsilon(latents, noise_pred_uncond, t), ) + self.pred_epsilon(latents, noise_pred_uncond, t), + ) uncond_emb, _ = prompt_embeds.chunk(2) # forward and give guidance - degraded_pred = self.unet( - degraded_latents, - t, - encoder_hidden_states=uncond_emb).sample - noise_pred += sag_scale * ( - noise_pred_uncond - degraded_pred) + degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=uncond_emb).sample + noise_pred += sag_scale * (noise_pred_uncond - degraded_pred) else: # DDIM-like prediction of x0 pred_x0 = self.pred_x0(latents, noise_pred, t) @@ -631,22 +622,17 @@ def 
get_map_size(module, input, output): cond_attn, map_size, t, - self.pred_epsilon(latents, noise_pred, t), ) + self.pred_epsilon(latents, noise_pred, t), + ) # forward and give guidance - degraded_pred = self.unet( - degraded_latents, - t, - encoder_hidden_states=prompt_embeds).sample + degraded_pred = self.unet(degraded_latents, t, encoder_hidden_states=prompt_embeds).sample noise_pred += sag_scale * (noise_pred - degraded_pred) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -656,8 +642,7 @@ def get_map_size(module, input, output): image = self.decode_latents(latents) # 9. Run safety checker - image, has_nsfw_concept = self.run_safety_checker(image, - prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) # 10. Convert to PIL if output_type == "pil": @@ -666,8 +651,7 @@ def get_map_size(module, input, output): if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) def sag_masking(self, original_latents, attn_map, map_size, t, eps): # Same masking process as in SAG paper: https://arxiv.org/pdf/2210.00939.pdf @@ -681,20 +665,20 @@ def sag_masking(self, original_latents, attn_map, map_size, t, eps): attn_map = attn_map.reshape([b, h, hw1, hw2]) attn_mask = attn_map.mean(1, keepdim=False).sum(1, keepdim=False) > 1.0 - attn_mask = (attn_mask.reshape([b, map_size[0], map_size[1]]) - .unsqueeze(1).tile([1, latent_channel, 1, 1]) - .cast(attn_map.dtype)) + attn_mask = ( + attn_mask.reshape([b, map_size[0], map_size[1]]) + .unsqueeze(1) + .tile([1, latent_channel, 1, 1]) + .cast(attn_map.dtype) + ) attn_mask = F.interpolate(attn_mask, (latent_h, latent_w)) # Blur according to the self-attention mask - degraded_latents = gaussian_blur_2d( - original_latents, kernel_size=9, sigma=1.0) - degraded_latents = degraded_latents * attn_mask + original_latents * ( - 1 - attn_mask) + degraded_latents = gaussian_blur_2d(original_latents, kernel_size=9, sigma=1.0) + degraded_latents = degraded_latents * attn_mask + original_latents * (1 - attn_mask) # Noise it again to match the noise level - degraded_latents = self.scheduler.add_noise( - degraded_latents, noise=eps, timesteps=t) + degraded_latents = self.scheduler.add_noise(degraded_latents, noise=eps, timesteps=t) return degraded_latents @@ -705,20 +689,18 @@ def pred_x0(self, sample, model_output, timestep): beta_prod_t = 1 - alpha_prod_t if self.scheduler.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) elif self.scheduler.config.prediction_type == "sample": pred_original_sample = model_output elif self.scheduler.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * 
model_output + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output # predict V - model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," - " or `v_prediction`") + " or `v_prediction`" + ) return pred_original_sample @@ -729,15 +711,14 @@ def pred_epsilon(self, sample, model_output, timestep): if self.scheduler.config.prediction_type == "epsilon": pred_eps = model_output elif self.scheduler.config.prediction_type == "sample": - pred_eps = (sample - - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5) + pred_eps = (sample - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5) elif self.scheduler.config.prediction_type == "v_prediction": - pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5 - ) * model_output + pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5) * model_output else: raise ValueError( f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," - " or `v_prediction`") + " or `v_prediction`" + ) return pred_eps @@ -753,12 +734,9 @@ def gaussian_blur_2d(img, kernel_size, sigma): x_kernel = x_kernel.cast(img.dtype) kernel2d = paddle.matmul(x_kernel[:, None], x_kernel[None, :]) - kernel2d = kernel2d.expand( - [img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]]) + kernel2d = kernel2d.expand([img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]]) - padding = [ - kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2 - ] + padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2] img = F.pad(img, padding, mode="reflect") img = F.conv2d(img, kernel2d, groups=img.shape[-3]) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 4a2ca10a74b68..85b0706b3ed80 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -37,8 +37,7 @@ def preprocess(image): if isinstance(image[0], PIL.Image.Image): w, h = image[0].size - w, h = map(lambda x: x - x % 64, - (w, h)) # resize to integer multiple of 64 + w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64 image = [np.array(i.resize((w, h)))[None, :] for i in image] image = np.concatenate(image, axis=0) @@ -78,20 +77,21 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - low_res_scheduler: DDPMScheduler, - scheduler: KarrasDiffusionSchedulers, - max_noise_level: int=350, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + low_res_scheduler: DDPMScheduler, + scheduler: KarrasDiffusionSchedulers, + max_noise_level: int = 350, + ): super().__init__() # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate is_vae_scaling_factor_set_to_0_08333 = ( - hasattr(vae.config, "scaling_factor") and - vae.config.scaling_factor == 0.08333) + hasattr(vae.config, "scaling_factor") and 
vae.config.scaling_factor == 0.08333 + ) if not is_vae_scaling_factor_set_to_0_08333: deprecation_message = ( "The configuration file of the vae does not contain `scaling_factor` or it is set to" @@ -105,7 +105,8 @@ def __init__( "wrong scaling_factor", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) vae.register_to_config(scaling_factor=0.08333) self.register_modules( @@ -114,18 +115,20 @@ def __init__( tokenizer=tokenizer, unet=unet, low_res_scheduler=low_res_scheduler, - scheduler=scheduler, ) + scheduler=scheduler, + ) self.register_to_config(max_noise_level=max_noise_level) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. @@ -161,29 +164,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -191,8 +196,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -202,14 +206,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" 
{type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -219,36 +225,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -259,15 +262,13 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -283,13 +284,13 @@ def decode_latents(self, latents): def check_inputs(self, prompt, image, noise_level, callback_steps): if not isinstance(prompt, str) and not isinstance(prompt, list): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( f"`image` has to be of type `paddle.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" ) @@ -312,34 +313,32 @@ def check_inputs(self, prompt, image, noise_level, callback_steps): # check noise level if noise_level > self.config.max_noise_level: - raise ValueError( - f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}" - ) + raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = (batch_size, num_channels_latents, height, width) if latents is None: latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents # scale the initial noise by the standard deviation required by the scheduler @@ -348,25 +347,24 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]]=None, - image: Union[paddle.Tensor, PIL.Image.Image, List[ - PIL.Image.Image]]=None, - num_inference_steps: int=75, - guidance_scale: float=9.0, - noise_level: int=20, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]] = None, + image: Union[paddle.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None, + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + noise_level: int = 20, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -472,7 +470,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Preprocess image image = preprocess(image) @@ -484,13 +483,11 @@ def __call__( # 5. Add noise to image noise_level = paddle.to_tensor([noise_level], dtype="int64") - noise = randn_tensor( - image.shape, generator=generator, dtype=prompt_embeds.dtype) + noise = randn_tensor(image.shape, generator=generator, dtype=prompt_embeds.dtype) image = self.low_res_scheduler.add_noise(image, noise, noise_level) batch_multiplier = 2 if do_classifier_free_guidance else 1 - image = paddle.concat([image] * batch_multiplier * - num_images_per_prompt) + image = paddle.concat([image] * batch_multiplier * num_images_per_prompt) noise_level = paddle.concat([noise_level] * image.shape[0]) # 6. Prepare latent variables @@ -503,7 +500,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 7. 
Check that sizes of image and latents match num_channels_image = image.shape[1] @@ -513,48 +511,41 @@ def __call__( f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" f" `num_channels_image`: {num_channels_image} " f" = {num_channels_latents+num_channels_image}. Please verify the config of" - " `pipeline.unet` or your `image` input.") + " `pipeline.unet` or your `image` input." + ) # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 9. Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents # concat latents, mask, masked_image_latents in the channel dimension - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) - latent_model_input = paddle.concat( - [latent_model_input, image.cast(latent_model_input.dtype)], - axis=1) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = paddle.concat([latent_model_input, image.cast(latent_model_input.dtype)], axis=1) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - class_labels=noise_level, ).sample + class_labels=noise_level, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -569,6 +560,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index f89be55fdda9d..eaa7be8cb0324 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -17,8 +17,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import paddle -from paddlenlp.transformers import (CLIPTextModel, CLIPTextModelWithProjection, - CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, +) from paddlenlp.transformers.clip.modeling import CLIPTextModelOutput from 
...loaders import TextualInversionLoaderMixin @@ -26,6 +29,7 @@ from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, randn_tensor, replace_example_docstring + # from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer @@ -103,22 +107,23 @@ class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin): vae: AutoencoderKL def __init__( - self, - # prior components - prior_tokenizer: CLIPTokenizer, - prior_text_encoder: CLIPTextModelWithProjection, - prior: PriorTransformer, - prior_scheduler: KarrasDiffusionSchedulers, - # image noising components - image_normalizer: StableUnCLIPImageNormalizer, - image_noising_scheduler: KarrasDiffusionSchedulers, - # regular denoising components - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModelWithProjection, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - # vae - vae: AutoencoderKL, ): + self, + # prior components + prior_tokenizer: CLIPTokenizer, + prior_text_encoder: CLIPTextModelWithProjection, + prior: PriorTransformer, + prior_scheduler: KarrasDiffusionSchedulers, + # image noising components + image_normalizer: StableUnCLIPImageNormalizer, + image_noising_scheduler: KarrasDiffusionSchedulers, + # regular denoising components + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + # vae + vae: AutoencoderKL, + ): super().__init__() self.register_modules( @@ -132,18 +137,20 @@ def __init__( text_encoder=text_encoder, unet=unet, scheduler=scheduler, - vae=vae, ) + vae=vae, + ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) # Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder def _encode_prior_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None, - text_attention_mask: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): if text_model_output is None: batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -153,44 +160,42 @@ def _encode_prior_prompt( max_length=self.prior_tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids text_mask = text_inputs.attention_mask - untruncated_ids = self.prior_tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.prior_tokenizer.batch_decode( - untruncated_ids[:, 
self.prior_tokenizer.model_max_length - - 1:-1]) + untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" ) - text_input_ids = text_input_ids[:, :self.prior_tokenizer. - model_max_length] + text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] prior_text_encoder_output = self.prior_text_encoder(text_input_ids) prompt_embeds = prior_text_encoder_output.text_embeds - prior_text_encoder_hidden_states = ( - prior_text_encoder_output.last_hidden_state) + prior_text_encoder_hidden_states = prior_text_encoder_output.last_hidden_state else: batch_size = text_model_output[0].shape[0] prompt_embeds, prior_text_encoder_hidden_states = ( text_model_output[0], - text_model_output[1], ) + text_model_output[1], + ) text_mask = text_attention_mask - prompt_embeds = prompt_embeds.repeat_interleave( - num_images_per_prompt, axis=0) - prior_text_encoder_hidden_states = ( - prior_text_encoder_hidden_states.repeat_interleave( - num_images_per_prompt, axis=0)) + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) + prior_text_encoder_hidden_states = prior_text_encoder_hidden_states.repeat_interleave( + num_images_per_prompt, axis=0 + ) text_mask = text_mask.repeat_interleave(num_images_per_prompt, axis=0) @@ -203,46 +208,43 @@ def _encode_prior_prompt( max_length=self.prior_tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder( - uncond_input.input_ids) + negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder(uncond_input.input_ids) - negative_prompt_embeds = ( - negative_prompt_embeds_prior_text_encoder_output.text_embeds) + negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds uncond_prior_text_encoder_hidden_states = ( - negative_prompt_embeds_prior_text_encoder_output. 
- last_hidden_state) + negative_prompt_embeds_prior_text_encoder_output.last_hidden_state + ) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) seq_len = uncond_prior_text_encoder_hidden_states.shape[1] - uncond_prior_text_encoder_hidden_states = ( - uncond_prior_text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1])) - uncond_prior_text_encoder_hidden_states = ( - uncond_prior_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1])) - uncond_text_mask = uncond_text_mask.repeat_interleave( - num_images_per_prompt, axis=0) + uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.tile( + [1, num_images_per_prompt, 1] + ) + uncond_prior_text_encoder_hidden_states = uncond_prior_text_encoder_hidden_states.reshape( + [batch_size * num_images_per_prompt, seq_len, -1] + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) # done duplicates # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) - prior_text_encoder_hidden_states = paddle.concat([ - uncond_prior_text_encoder_hidden_states, - prior_text_encoder_hidden_states, - ]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + prior_text_encoder_hidden_states = paddle.concat( + [ + uncond_prior_text_encoder_hidden_states, + prior_text_encoder_hidden_states, + ] + ) text_mask = paddle.concat([uncond_text_mask, text_mask]) @@ -250,13 +252,14 @@ def _encode_prior_prompt( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
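Every `_encode_prompt` variant touched by this diff ends with the same classifier-free guidance batching: embeddings are tiled once per generation, the negative (unconditional) batch is concatenated in front of the text batch, and the doubled UNet output is split again later with `chunk(2)`. A minimal Paddle sketch of that pattern follows; the shapes and the guidance scale are illustrative assumptions, not values taken from this diff.

import paddle

batch_size, seq_len, dim = 2, 77, 768                               # assumed shapes
prompt_embeds = paddle.randn([batch_size, seq_len, dim])            # "text" half
negative_prompt_embeds = paddle.zeros([batch_size, seq_len, dim])   # "uncond" half

# one forward pass over a doubled batch instead of two separate passes
cfg_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])  # [2*B, L, D]

# after the UNet call the two halves are recombined (stand-in tensor below)
noise_pred = paddle.randn([2 * batch_size, 4, 64, 64])
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guidance_scale = 7.5
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)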
@@ -296,29 +299,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -326,8 +331,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -337,21 +341,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -359,36 +364,33 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -408,15 +410,13 @@ def prepare_prior_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.prior_scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the prior_scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.prior_scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs @@ -428,40 +428,38 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - noise_level, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + noise_level, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( @@ -473,11 +471,8 @@ def check_inputs( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - if prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -488,17 +483,18 @@ def check_inputs( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - if (noise_level < 0 or noise_level >= - self.image_noising_scheduler.config.num_train_timesteps): + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: raise ValueError( f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." ) @@ -509,20 +505,19 @@ def prepare_latents(self, shape, dtype, generator, latents, scheduler): latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents latents = latents * scheduler.init_noise_sigma return latents def noise_image_embeddings( - self, - image_embeds: paddle.Tensor, - noise_level: int, - noise: Optional[paddle.Tensor]=None, - generator: Optional[paddle.Generator]=None, ): + self, + image_embeds: paddle.Tensor, + noise_level: int, + noise: Optional[paddle.Tensor] = None, + generator: Optional[paddle.Generator] = None, + ): """ Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher `noise_level` increases the variance in the final un-noised images. @@ -536,17 +531,13 @@ def noise_image_embeddings( The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. """ if noise is None: - noise = randn_tensor( - image_embeds.shape, - generator=generator, - dtype=image_embeds.dtype) + noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype) noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0]) image_embeds = self.image_normalizer.scale(image_embeds) - image_embeds = self.image_noising_scheduler.add_noise( - image_embeds, timesteps=noise_level, noise=noise) + image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) image_embeds = self.image_normalizer.unscale(image_embeds) @@ -554,7 +545,8 @@ def noise_image_embeddings( timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, - downscale_freq_shift=0, ) + downscale_freq_shift=0, + ) # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, # but we might actually be running in fp16. so we need to cast here. 
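`noise_image_embeddings` above implements unCLIP-style noise augmentation: the CLIP image embedding is scaled by the image normalizer, noised to `noise_level` with the image-noising scheduler, un-scaled, and then a sinusoidal embedding of `noise_level` is appended so the UNet can condition on how much noise was applied. The self-contained sketch below captures the idea only; the closed-form noising step, the hand-rolled embedding function, and all shapes are assumptions standing in for `scheduler.add_noise` and `get_timestep_embedding`.

import math

import paddle

def sinusoidal_embedding(timesteps, dim, max_period=10000):
    # transformer-style sin/cos embedding of an integer noise level
    half = dim // 2
    freqs = paddle.exp(-math.log(max_period) * paddle.arange(half, dtype="float32") / half)
    args = timesteps.cast("float32")[:, None] * freqs[None, :]
    return paddle.concat([paddle.cos(args), paddle.sin(args)], axis=-1)

image_embeds = paddle.randn([2, 768])
noise = paddle.randn(image_embeds.shape)
noise_level = paddle.to_tensor([20] * image_embeds.shape[0])

# q(x_t | x_0) = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps
alpha_bar_t = paddle.to_tensor(0.98)  # assumed cumulative alpha at this noise level
image_embeds = alpha_bar_t.sqrt() * image_embeds + (1.0 - alpha_bar_t).sqrt() * noise

# append the noise-level embedding so the conditioning carries the noise amount
level_emb = sinusoidal_embedding(noise_level, image_embeds.shape[-1])
image_embeds = paddle.concat([image_embeds, level_emb], axis=-1)  # [2, 1536]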
@@ -568,30 +560,31 @@ def noise_image_embeddings( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - # regular denoising process args - prompt: Optional[Union[str, List[str]]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=20, - guidance_scale: float=10.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=0, - # prior args - prior_num_inference_steps: int=25, - prior_guidance_scale: float=4.0, - prior_latents: Optional[paddle.Tensor]=None, ): + self, + # regular denoising process args + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 20, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + # prior args + prior_num_inference_steps: int = 25, + prior_guidance_scale: float = 4.0, + prior_latents: Optional[paddle.Tensor] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -687,7 +680,8 @@ def __call__( noise_level=noise_level, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -705,13 +699,11 @@ def __call__( prior_do_classifier_free_guidance = prior_guidance_scale > 1.0 # 3. Encode input prompt - ( - prior_prompt_embeds, - prior_text_encoder_hidden_states, - prior_text_mask, ) = self._encode_prior_prompt( - prompt=prompt, - num_images_per_prompt=num_images_per_prompt, - do_classifier_free_guidance=prior_do_classifier_free_guidance, ) + (prior_prompt_embeds, prior_text_encoder_hidden_states, prior_text_mask,) = self._encode_prior_prompt( + prompt=prompt, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=prior_do_classifier_free_guidance, + ) # 4. Prepare prior timesteps self.prior_scheduler.set_timesteps(prior_num_inference_steps) @@ -724,43 +716,43 @@ def __call__( prior_prompt_embeds.dtype, generator, prior_latents, - self.prior_scheduler, ) + self.prior_scheduler, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs( - generator, eta) + prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(generator, eta) # 7. 
Prior denoising loop for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([prior_latents] * 2) - if prior_do_classifier_free_guidance else - prior_latents) - latent_model_input = self.prior_scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([prior_latents] * 2) if prior_do_classifier_free_guidance else prior_latents + ) + latent_model_input = self.prior_scheduler.scale_model_input(latent_model_input, t) predicted_image_embedding = self.prior( latent_model_input, timestep=t, proj_embedding=prior_prompt_embeds, encoder_hidden_states=prior_text_encoder_hidden_states, - attention_mask=prior_text_mask, ).predicted_image_embedding + attention_mask=prior_text_mask, + ).predicted_image_embedding if prior_do_classifier_free_guidance: ( predicted_image_embedding_uncond, predicted_image_embedding_text, ) = predicted_image_embedding.chunk(2) - predicted_image_embedding = ( - predicted_image_embedding_uncond + prior_guidance_scale * - (predicted_image_embedding_text - - predicted_image_embedding_uncond)) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) prior_latents = self.prior_scheduler.step( predicted_image_embedding, timestep=t, sample=prior_latents, - **prior_extra_step_kwargs, ).prev_sample + **prior_extra_step_kwargs, + ).prev_sample if callback is not None and i % callback_steps == 0: callback(i, t, prior_latents) @@ -783,13 +775,15 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 9. Prepare image embeddings image_embeds = self.noise_image_embeddings( image_embeds=image_embeds, noise_level=noise_level, - generator=generator, ) + generator=generator, + ) if do_classifier_free_guidance: negative_prompt_embeds = paddle.zeros_like(image_embeds) @@ -809,23 +803,23 @@ def __call__( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) latents = self.prepare_latents( shape=shape, dtype=prompt_embeds.dtype, generator=generator, latents=latents, - scheduler=self.scheduler, ) + scheduler=self.scheduler, + ) # 12. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 13. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( @@ -833,17 +827,16 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=image_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -856,6 +849,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 043b5a310a9de..288dccda66f3b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -18,9 +18,12 @@ import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel @@ -60,8 +63,7 @@ """ -class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, - TextualInversionLoaderMixin): +class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): """ Pipeline for text-guided image to image generation using stable unCLIP. 
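Both the prior loop and the latent denoising loops in the hunks above share one skeleton: scale the (optionally doubled) input, predict noise, apply classifier-free guidance, and step the scheduler. The runnable sketch below uses toy stand-ins for the scheduler and UNet purely to show that control flow; none of the stand-in classes exist in ppdiffusers.

from types import SimpleNamespace

import paddle

class ToyScheduler:
    # stand-in for a KarrasDiffusionSchedulers scheduler
    init_noise_sigma = 1.0

    def set_timesteps(self, num_inference_steps):
        self.timesteps = list(range(num_inference_steps - 1, -1, -1))

    def scale_model_input(self, sample, t):
        return sample

    def step(self, noise_pred, t, sample):
        # crude update, purely for illustration
        return SimpleNamespace(prev_sample=sample - 0.1 * noise_pred)

def toy_unet(latent_model_input, t, encoder_hidden_states):
    # stand-in for UNet2DConditionModel.__call__
    return SimpleNamespace(sample=paddle.zeros_like(latent_model_input))

scheduler = ToyScheduler()
do_classifier_free_guidance, guidance_scale = True, 7.5
prompt_embeds = paddle.randn([2, 77, 768])            # [uncond; text] halves
latents = paddle.randn([1, 4, 64, 64]) * scheduler.init_noise_sigma

scheduler.set_timesteps(10)
for t in scheduler.timesteps:
    latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents
    latent_model_input = scheduler.scale_model_input(latent_model_input, t)
    noise_pred = toy_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample
    if do_classifier_free_guidance:
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
    latents = scheduler.step(noise_pred, t, latents).prev_sample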
@@ -108,20 +110,21 @@ class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, vae: AutoencoderKL def __init__( - self, - # image encoding components - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - # image noising components - image_normalizer: StableUnCLIPImageNormalizer, - image_noising_scheduler: KarrasDiffusionSchedulers, - # regular denoising components - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModel, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - # vae - vae: AutoencoderKL, ): + self, + # image encoding components + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + # image noising components + image_normalizer: StableUnCLIPImageNormalizer, + image_noising_scheduler: KarrasDiffusionSchedulers, + # regular denoising components + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + # vae + vae: AutoencoderKL, + ): super().__init__() self.register_modules( @@ -133,19 +136,21 @@ def __init__( text_encoder=text_encoder, unet=unet, scheduler=scheduler, - vae=vae, ) + vae=vae, + ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): r""" Encodes the prompt into text encoder hidden states. 
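The `vae_scale_factor` computed in `__init__` above is what maps pixel sizes to latent sizes in the stable unCLIP pipelines, and it is why `check_inputs` insists that `height` and `width` be divisible by 8. A small worked example; the `block_out_channels` list is an assumed, typical Stable Diffusion VAE config rather than a value read from this diff.

block_out_channels = [128, 256, 512, 512]               # assumed VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # -> 8
height, width = 768, 768
latent_shape = (1, 4, height // vae_scale_factor, width // vae_scale_factor)
print(latent_shape)                                     # (1, 4, 96, 96)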
@@ -185,29 +190,31 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) @@ -215,8 +222,7 @@ def _encode_prompt( bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -226,21 +232,22 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = self.tokenizer( @@ -248,48 +255,46 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def _encode_image( - self, - image, - batch_size, - num_images_per_prompt, - do_classifier_free_guidance, - noise_level, - generator, - image_embeds, ): + self, + image, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance, + noise_level, + generator, + image_embeds, + ): dtype = self.image_encoder.dtype if isinstance(image, PIL.Image.Image): @@ -306,8 +311,7 @@ def _encode_image( if image_embeds is None: if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) image_embeds = self.image_encoder(image).image_embeds @@ -315,7 +319,8 @@ def _encode_image( image_embeds = self.noise_image_embeddings( image_embeds=image_embeds, noise_level=noise_level, - generator=generator, ) + generator=generator, + ) # duplicate image embeddings for each generation per prompt, using mps friendly method image_embeds = image_embeds.unsqueeze(1) @@ -350,42 +355,40 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - image, - height, - width, - callback_steps, - noise_level, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, - image_embeds=None, ): + self, + prompt, + image, + height, + width, + callback_steps, + noise_level, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + image_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( @@ -397,11 +400,8 @@ def check_inputs( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - if prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -412,17 +412,18 @@ def check_inputs( if type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) - if (noise_level < 0 or noise_level >= - self.image_noising_scheduler.config.num_train_timesteps): + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: raise ValueError( f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." ) @@ -438,28 +439,33 @@ def check_inputs( ) if image is not None: - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -475,11 +481,12 @@ def prepare_latents( # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_unclip.StableUnCLIPPipeline.noise_image_embeddings def noise_image_embeddings( - self, - image_embeds: paddle.Tensor, - noise_level: int, - noise: Optional[paddle.Tensor]=None, - generator: Optional[paddle.Generator]=None, ): + self, + image_embeds: paddle.Tensor, + noise_level: int, + noise: Optional[paddle.Tensor] = None, + generator: Optional[paddle.Generator] = None, + ): """ Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher `noise_level` increases the variance in the final un-noised images. @@ -493,18 +500,12 @@ def noise_image_embeddings( The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. 
""" if noise is None: - noise = randn_tensor( - image_embeds.shape, - generator=generator, - dtype=image_embeds.dtype) - noise_level = paddle.to_tensor([noise_level] * - image_embeds.shape[0]).reshape( - [image_embeds.shape[0]]) + noise = randn_tensor(image_embeds.shape, generator=generator, dtype=image_embeds.dtype) + noise_level = paddle.to_tensor([noise_level] * image_embeds.shape[0]).reshape([image_embeds.shape[0]]) image_embeds = self.image_normalizer.scale(image_embeds) - image_embeds = self.image_noising_scheduler.add_noise( - image_embeds, timesteps=noise_level, noise=noise) + image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) image_embeds = self.image_normalizer.unscale(image_embeds) @@ -512,7 +513,8 @@ def noise_image_embeddings( timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, - downscale_freq_shift=0, ) + downscale_freq_shift=0, + ) # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, # but we might actually be running in fp16. so we need to cast here. @@ -525,27 +527,28 @@ def noise_image_embeddings( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - image: Union[paddle.Tensor, PIL.Image.Image]=None, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=20, - guidance_scale: float=10, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[paddle.Generator]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - noise_level: int=0, - image_embeds: Optional[paddle.Tensor]=None, ): + self, + image: Union[paddle.Tensor, PIL.Image.Image] = None, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 20, + guidance_scale: float = 10, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[paddle.Generator] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + image_embeds: Optional[paddle.Tensor] = None, + ): r""" Function invoked when calling the pipeline for generation. @@ -641,7 +644,8 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, - image_embeds=image_embeds, ) + image_embeds=image_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -665,7 +669,8 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. 
Encoder input image noise_level = paddle.to_tensor(noise_level) @@ -676,7 +681,8 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, noise_level=noise_level, generator=generator, - image_embeds=image_embeds, ) + image_embeds=image_embeds, + ) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -691,17 +697,16 @@ def __call__( width=width, dtype=prompt_embeds.dtype, generator=generator, - latents=latents, ) + latents=latents, + ) # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 8. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( @@ -709,17 +714,16 @@ def __call__( t, encoder_hidden_states=prompt_embeds, class_labels=image_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -732,6 +736,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py index 8fa2d0f3796b1..28920a1c6de42 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/safety_checker.py @@ -16,8 +16,11 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig, - CLIPVisionModel) +from paddlenlp.transformers import ( + CLIPPretrainedModel, + CLIPVisionConfig, + CLIPVisionModel, +) from ...utils import logging @@ -27,8 +30,7 @@ def cosine_distance(image_embeds, text_embeds): normalized_image_embeds = F.normalize(image_embeds) normalized_text_embeds = F.normalize(text_embeds) - return paddle.matmul( - normalized_image_embeds, normalized_text_embeds, transpose_y=True) + return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True) class StableDiffusionSafetyChecker(CLIPPretrainedModel): @@ -40,12 +42,11 @@ def __init__(self, config: CLIPVisionConfig): self.clip = CLIPVisionModel(config) self.vision_projection = paddle.create_parameter( (config.hidden_size, config.projection_dim), - dtype=paddle.get_default_dtype(), ) + dtype=paddle.get_default_dtype(), + ) - self.register_buffer("concept_embeds", - paddle.ones([17, config.projection_dim])) - 
self.register_buffer("special_care_embeds", - paddle.ones([3, config.projection_dim])) + self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim])) + self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim])) self.register_buffer("concept_embeds_weights", paddle.ones([17])) self.register_buffer("special_care_embeds_weights", paddle.ones([3])) @@ -56,11 +57,8 @@ def forward(self, clip_input, images): image_embeds = paddle.matmul(pooled_output, self.vision_projection) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 - special_cos_dist = ( - cosine_distance(image_embeds, self.special_care_embeds) - .astype("float32").numpy()) - cos_dist = (cosine_distance( - image_embeds, self.concept_embeds).astype("float32").numpy()) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy() + cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy() result = [] batch_size = image_embeds.shape[0] @@ -78,22 +76,16 @@ def forward(self, clip_input, images): for concept_idx in range(len(special_cos_dist[0])): concept_cos = special_cos_dist[i][concept_idx] - concept_threshold = self.special_care_embeds_weights[ - concept_idx].item() - result_img["special_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.special_care_embeds_weights[concept_idx].item() + result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if result_img["special_scores"][concept_idx] > 0: - result_img["special_care"].append({ - concept_idx, result_img["special_scores"][concept_idx] - }) + result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]}) adjustment = 0.01 for concept_idx in range(len(cos_dist[0])): concept_cos = cos_dist[i][concept_idx] - concept_threshold = self.concept_embeds_weights[ - concept_idx].item() - result_img["concept_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.concept_embeds_weights[concept_idx].item() + result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if result_img["concept_scores"][concept_idx] > 0: result_img["bad_concepts"].append(concept_idx) @@ -111,34 +103,29 @@ def forward(self, clip_input, images): if any(has_nsfw_concepts): logger.warning( "Potential NSFW content was detected in one or more images. A black image will be returned instead." - " Try again with a different prompt and/or seed.") + " Try again with a different prompt and/or seed." 
+ ) return images, has_nsfw_concepts - def forward_fastdeploy(self, - clip_input: paddle.Tensor, - images: paddle.Tensor): + def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor): pooled_output = self.clip(clip_input)[1] # pooled_output image_embeds = paddle.matmul(pooled_output, self.vision_projection) - special_cos_dist = cosine_distance(image_embeds, - self.special_care_embeds) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) cos_dist = cosine_distance(image_embeds, self.concept_embeds) # increase this value to create a stronger `nsfw` filter # at the cost of increasing the possibility of filtering benign images adjustment = 0.0 - special_scores = ( - special_cos_dist - self.special_care_embeds_weights + adjustment) + special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment # special_scores = special_scores.round(decimals=3) special_care = paddle.any(special_scores > 0, axis=1) special_adjustment = special_care * 0.01 - special_adjustment = special_adjustment.unsqueeze(1).expand( - [-1, cos_dist.shape[1]]) + special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]]) - concept_scores = (cos_dist - self.concept_embeds_weights - ) + special_adjustment + concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment # concept_scores = concept_scores.round(decimals=3) has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py index dd502e817aac3..8792792dd7fc4 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py @@ -32,34 +32,38 @@ class StableUnCLIPImageNormalizer(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - embedding_dim: int=768, ): + self, + embedding_dim: int = 768, + ): super().__init__() self.mean = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) self.std = self.create_parameter( (1, embedding_dim), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(1.0), ) + default_initializer=nn.initializer.Constant(1.0), + ) def to( - self, - device: Optional[str]=None, - dtype: Optional[paddle.dtype]=None, ): + self, + device: Optional[str] = None, + dtype: Optional[paddle.dtype] = None, + ): if dtype is not None: self.mean = self.create_parameter( self.mean.shape, dtype=dtype, - default_initializer=paddle.nn.initializer.Assign( - self.mean.numpy()), ) + default_initializer=paddle.nn.initializer.Assign(self.mean.numpy()), + ) self.std = self.create_parameter( self.std.shape, dtype=dtype, - default_initializer=paddle.nn.initializer.Assign(self.std.numpy( - )), ) + default_initializer=paddle.nn.initializer.Assign(self.std.numpy()), + ) if device is not None: self.mean._to(device) self.std._to(device) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py index 82b88765d936c..b2c27a601a306 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py +++ 
b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -20,8 +20,7 @@ import numpy as np import paddle from packaging import version -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel @@ -67,41 +66,38 @@ class StableDiffusionPipelineSafe(DiffusionPipeline): _optional_components = ["safety_checker", "feature_extractor"] def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: SafeStableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: SafeStableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__() safety_concept: Optional[str] = ( "an image showing hate, harassment, violence, suffering, humiliation, harm, suicide, sexual, nudity," " bodily fluids, blood, obscene gestures, illegal activity, drug use, theft, vandalism, weapons, child" - " abuse, brutality, cruelty") + " abuse, brutality, cruelty" + ) - if (hasattr(scheduler.config, "steps_offset") and - scheduler.config.steps_offset != 1): + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " "to update the config accordingly as leaving `steps_offset` might led to incorrect results" " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" - " file") - deprecate( - "steps_offset!=1", - "1.0.0", - deprecation_message, - standard_warn=False) + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["steps_offset"] = 1 scheduler._internal_dict = FrozenDict(new_config) - if (hasattr(scheduler.config, "clip_sample") and - scheduler.config.clip_sample is True): + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: deprecation_message = ( f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." " `clip_sample` should be set to False in the configuration file. Please make sure to update the" @@ -109,11 +105,7 @@ def __init__( " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" ) - deprecate( - "clip_sample not set", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(scheduler.config) new_config["clip_sample"] = False scheduler._internal_dict = FrozenDict(new_config) @@ -134,12 +126,10 @@ def __init__( " checker. 
If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." ) - is_unet_version_less_0_9_0 = hasattr( - unet.config, "_ppdiffusers_version") and version.parse( - version.parse(unet.config._ppdiffusers_version) - .base_version) < version.parse("0.9.0.dev0") - is_unet_sample_size_less_64 = (hasattr(unet.config, "sample_size") and - unet.config.sample_size < 64) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_ppdiffusers_version") and version.parse( + version.parse(unet.config._ppdiffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: deprecation_message = ( "The configuration file of the unet has set the default `sample_size` to smaller than" @@ -150,12 +140,9 @@ def __init__( " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" " in the config might lead to incorrect results in future versions. If you have downloaded this" " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" - " the `unet/config.json` file") - deprecate( - "sample_size<64", - "1.0.0", - deprecation_message, - standard_warn=False) + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) new_config = dict(unet.config) new_config["sample_size"] = 64 unet._internal_dict = FrozenDict(new_config) @@ -167,9 +154,10 @@ def __init__( unet=unet, scheduler=scheduler, safety_checker=safety_checker, - feature_extractor=feature_extractor, ) + feature_extractor=feature_extractor, + ) self._safety_text_concept = safety_concept - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.register_to_config(requires_safety_checker=requires_safety_checker) @property @@ -194,12 +182,13 @@ def safety_concept(self, concept): self._safety_text_concept = concept def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - enable_safety_guidance, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + enable_safety_guidance, + ): r""" Encodes the prompt into text encoder hidden states. 
@@ -221,35 +210,35 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = prompt_embeds[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -259,14 +248,16 @@ def _encode_prompt( elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_tokens = negative_prompt @@ -276,25 +267,24 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # Encode the safety concept text if enable_safety_guidance: @@ -303,40 +293,35 @@ def _encode_prompt( padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - safety_embeddings = self.text_encoder( - safety_concept_input.input_ids)[0] + return_tensors="pd", + ) + safety_embeddings = self.text_encoder(safety_concept_input.input_ids)[0] # duplicate safety embeddings for each generation per prompt, using mps friendly method seq_len = safety_embeddings.shape[1] - safety_embeddings = safety_embeddings.tile( - [batch_size, num_images_per_prompt, 1]) - safety_embeddings = safety_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + safety_embeddings = safety_embeddings.tile([batch_size, num_images_per_prompt, 1]) + safety_embeddings = safety_embeddings.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance + sld, we need to do three forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing three forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds, safety_embeddings]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds, safety_embeddings]) else: # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def run_safety_checker(self, image, dtype, enable_safety_guidance): if self.safety_checker is not None: images = image.copy() - safety_checker_input = self.feature_extractor( - self.numpy_to_pil(image), return_tensors="pd") + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pd") image, has_nsfw_concept = self.safety_checker( - images=image, - clip_input=safety_checker_input.pixel_values.cast(dtype)) + images=image, clip_input=safety_checker_input.pixel_values.cast(dtype) + ) flagged_images = np.zeros((2, *image.shape[1:])) if any(has_nsfw_concept): logger.warning( @@ -369,54 +354,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -429,23 +410,26 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -460,49 +444,48 @@ def prepare_latents( return latents def perform_safety_guidance( - self, - enable_safety_guidance, - safety_momentum, - noise_guidance, - noise_pred_out, - i, - sld_guidance_scale, - sld_warmup_steps, - sld_threshold, - sld_momentum_scale, - sld_mom_beta, ): + self, + enable_safety_guidance, + safety_momentum, + noise_guidance, + noise_pred_out, + i, + sld_guidance_scale, + sld_warmup_steps, + sld_threshold, + sld_momentum_scale, + sld_mom_beta, + ): # Perform SLD guidance if enable_safety_guidance: if safety_momentum is None: safety_momentum = paddle.zeros_like(noise_guidance) - noise_pred_text, noise_pred_uncond = noise_pred_out[ - 0], noise_pred_out[1] + noise_pred_text, noise_pred_uncond = noise_pred_out[0], noise_pred_out[1] noise_pred_safety_concept = noise_pred_out[2] # Equation 6 scale = paddle.clip( - paddle.abs((noise_pred_text - noise_pred_safety_concept)) * - sld_guidance_scale, - max=1.0, ) + paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, + max=1.0, + ) # Equation 6 safety_concept_scale = paddle.where( (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, paddle.zeros_like(scale), - scale, ) + scale, + ) # Equation 4 noise_guidance_safety = paddle.multiply( - (noise_pred_safety_concept - noise_pred_uncond), - safety_concept_scale) + (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale + ) # Equation 7 - noise_guidance_safety = ( - noise_guidance_safety + sld_momentum_scale * safety_momentum) + noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum # Equation 8 - safety_momentum = (sld_mom_beta * safety_momentum + - (1 - sld_mom_beta) * noise_guidance_safety) + safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety if i >= sld_warmup_steps: # Warmup # Equation 3 @@ -511,27 +494,27 @@ def perform_safety_guidance( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: 
float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - sld_guidance_scale: Optional[float]=1000, - sld_warmup_steps: Optional[int]=10, - sld_threshold: Optional[float]=0.01, - sld_momentum_scale: Optional[float]=0.3, - sld_mom_beta: Optional[float]=0.4, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + sld_guidance_scale: Optional[float] = 1000, + sld_warmup_steps: Optional[int] = 10, + sld_threshold: Optional[float] = 0.01, + sld_momentum_scale: Optional[float] = 0.3, + sld_mom_beta: Optional[float] = 0.4, + ): r""" Function invoked when calling the pipeline for generation. @@ -620,8 +603,7 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - enable_safety_guidance = (sld_guidance_scale > 1.0 and - do_classifier_free_guidance) + enable_safety_guidance = sld_guidance_scale > 1.0 and do_classifier_free_guidance if not enable_safety_guidance: warnings.warn("Safety checker disabled!") @@ -631,7 +613,8 @@ def __call__( num_images_per_prompt, do_classifier_free_guidance, negative_prompt, - enable_safety_guidance, ) + enable_safety_guidance, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -646,36 +629,35 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. 
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) safety_momentum = None - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat( - [latents] * (3 if enable_safety_guidance else 2)) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = ( + paddle.concat([latents] * (3 if enable_safety_guidance else 2)) + if do_classifier_free_guidance + else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: - noise_pred_out = noise_pred.chunk( - (3 if enable_safety_guidance else 2)) + noise_pred_out = noise_pred.chunk((3 if enable_safety_guidance else 2)) noise_pred_uncond, noise_pred_text = ( noise_pred_out[0], - noise_pred_out[1], ) + noise_pred_out[1], + ) # default classifier free guidance noise_guidance = noise_pred_text - noise_pred_uncond @@ -688,32 +670,28 @@ def __call__( # Equation 6 scale = paddle.clip( - paddle.abs( - (noise_pred_text - noise_pred_safety_concept)) * - sld_guidance_scale, - max=1.0, ) + paddle.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, + max=1.0, + ) # Equation 6 safety_concept_scale = paddle.where( - (noise_pred_text - noise_pred_safety_concept) >= - sld_threshold, + (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, paddle.zeros_like(scale), - scale, ) + scale, + ) # Equation 4 noise_guidance_safety = paddle.multiply( (noise_pred_safety_concept - noise_pred_uncond), - safety_concept_scale, ) + safety_concept_scale, + ) # Equation 7 - noise_guidance_safety = ( - noise_guidance_safety + sld_momentum_scale * - safety_momentum) + noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum # Equation 8 - safety_momentum = ( - sld_mom_beta * safety_momentum + - (1 - sld_mom_beta) * noise_guidance_safety) + safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety if i >= sld_warmup_steps: # Warmup # Equation 3 @@ -722,13 +700,10 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * noise_guidance # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -738,7 +713,8 @@ def __call__( # 9. Run safety checker image, has_nsfw_concept, flagged_images = self.run_safety_checker( - image, prompt_embeds.dtype, enable_safety_guidance) + image, prompt_embeds.dtype, enable_safety_guidance + ) # 10. 
Convert to PIL if output_type == "pil": @@ -751,11 +727,12 @@ def __call__( image, has_nsfw_concept, self._safety_text_concept if enable_safety_guidance else None, - flagged_images, ) + flagged_images, + ) return StableDiffusionSafePipelineOutput( images=image, nsfw_content_detected=has_nsfw_concept, - applied_safety_concept=self._safety_text_concept - if enable_safety_guidance else None, - unsafe_images=flagged_images, ) + applied_safety_concept=self._safety_text_concept if enable_safety_guidance else None, + unsafe_images=flagged_images, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py index ceae2727162f5..43772eac7c2cb 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_safe/safety_checker.py @@ -15,8 +15,11 @@ import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (CLIPPretrainedModel, CLIPVisionConfig, - CLIPVisionModel) +from paddlenlp.transformers import ( + CLIPPretrainedModel, + CLIPVisionConfig, + CLIPVisionModel, +) from ...utils import logging @@ -26,8 +29,7 @@ def cosine_distance(image_embeds, text_embeds): normalized_image_embeds = F.normalize(image_embeds) normalized_text_embeds = F.normalize(text_embeds) - return paddle.matmul( - normalized_image_embeds, normalized_text_embeds, transpose_y=True) + return paddle.matmul(normalized_image_embeds, normalized_text_embeds, transpose_y=True) class SafeStableDiffusionSafetyChecker(CLIPPretrainedModel): @@ -39,12 +41,11 @@ def __init__(self, config: CLIPVisionConfig): self.vision_projection = paddle.create_parameter( (config.hidden_size, config.projection_dim), - dtype=paddle.get_default_dtype(), ) + dtype=paddle.get_default_dtype(), + ) - self.register_buffer("concept_embeds", - paddle.ones([17, config.projection_dim])) - self.register_buffer("special_care_embeds", - paddle.ones([3, config.projection_dim])) + self.register_buffer("concept_embeds", paddle.ones([17, config.projection_dim])) + self.register_buffer("special_care_embeds", paddle.ones([3, config.projection_dim])) self.register_buffer("concept_embeds_weights", paddle.ones([17])) self.register_buffer("special_care_embeds_weights", paddle.ones([3])) @@ -55,11 +56,8 @@ def forward(self, clip_input, images): image_embeds = paddle.matmul(pooled_output, self.vision_projection) # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 - special_cos_dist = ( - cosine_distance(image_embeds, self.special_care_embeds) - .astype("float32").numpy()) - cos_dist = (cosine_distance( - image_embeds, self.concept_embeds).astype("float32").numpy()) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).astype("float32").numpy() + cos_dist = cosine_distance(image_embeds, self.concept_embeds).astype("float32").numpy() result = [] batch_size = image_embeds.shape[0] @@ -77,22 +75,16 @@ def forward(self, clip_input, images): for concept_idx in range(len(special_cos_dist[0])): concept_cos = special_cos_dist[i][concept_idx] - concept_threshold = self.special_care_embeds_weights[ - concept_idx].item() - result_img["special_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.special_care_embeds_weights[concept_idx].item() + result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if 
result_img["special_scores"][concept_idx] > 0: - result_img["special_care"].append({ - concept_idx, result_img["special_scores"][concept_idx] - }) + result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]}) adjustment = 0.01 for concept_idx in range(len(cos_dist[0])): concept_cos = cos_dist[i][concept_idx] - concept_threshold = self.concept_embeds_weights[ - concept_idx].item() - result_img["concept_scores"][concept_idx] = round( - concept_cos - concept_threshold + adjustment, 3) + concept_threshold = self.concept_embeds_weights[concept_idx].item() + result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) if result_img["concept_scores"][concept_idx] > 0: result_img["bad_concepts"].append(concept_idx) @@ -102,30 +94,24 @@ def forward(self, clip_input, images): return images, has_nsfw_concepts - def forward_fastdeploy(self, - clip_input: paddle.Tensor, - images: paddle.Tensor): + def forward_fastdeploy(self, clip_input: paddle.Tensor, images: paddle.Tensor): pooled_output = self.clip(clip_input)[1] # pooled_output image_embeds = paddle.matmul(pooled_output, self.vision_projection) - special_cos_dist = cosine_distance(image_embeds, - self.special_care_embeds) + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) cos_dist = cosine_distance(image_embeds, self.concept_embeds) # increase this value to create a stronger `nsfw` filter # at the cost of increasing the possibility of filtering benign images adjustment = 0.0 - special_scores = ( - special_cos_dist - self.special_care_embeds_weights + adjustment) + special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment # special_scores = special_scores.round(decimals=3) special_care = paddle.any(special_scores > 0, axis=1) special_adjustment = special_care * 0.01 - special_adjustment = special_adjustment.unsqueeze(1).expand( - [-1, cos_dist.shape[1]]) + special_adjustment = special_adjustment.unsqueeze(1).expand([-1, cos_dist.shape[1]]) - concept_scores = (cos_dist - self.concept_embeds_weights - ) + special_adjustment + concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment # concept_scores = concept_scores.round(decimals=3) has_nsfw_concepts = paddle.any(concept_scores > 0, axis=1) diff --git a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py index acd0aad93d9ee..d06ace3696225 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py +++ b/ppdiffusers/ppdiffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py @@ -48,14 +48,14 @@ def __init__(self, unet: UNet2DModel, scheduler: KarrasVeScheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - num_inference_steps: int=50, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[Tuple, ImagePipelineOutput]: + self, + batch_size: int = 1, + num_inference_steps: int = 50, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[Tuple, ImagePipelineOutput]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -82,8 +82,7 @@ def __call__( model = self.unet # sample x_0 ~ N(0, sigma_0^2 * I) - sample = (randn_tensor( - shape, 
generator=generator) * self.scheduler.init_noise_sigma) + sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma self.scheduler.set_timesteps(num_inference_steps) @@ -94,31 +93,28 @@ def __call__( # 1. Select temporarily increased noise level sigma_hat # 2. Add new noise to move from sample_i to sample_hat - sample_hat, sigma_hat = self.scheduler.add_noise_to_input( - sample, sigma, generator=generator) + sample_hat, sigma_hat = self.scheduler.add_noise_to_input(sample, sigma, generator=generator) # 3. Predict the noise residual given the noise magnitude `sigma_hat` # The model inputs and output are adjusted by following eq. (213) in [1]. - model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, - sigma_hat / 2).sample + model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, sigma_hat / 2).sample # 4. Evaluate dx/dt at sigma_hat # 5. Take Euler step from sigma to sigma_prev - step_output = self.scheduler.step(model_output, sigma_hat, - sigma_prev, sample_hat) + step_output = self.scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat) if sigma_prev != 0: # 6. Apply 2nd order correction # The model inputs and output are adjusted by following eq. (213) in [1]. - model_output = (sigma_prev / 2) * model( - (step_output.prev_sample + 1) / 2, sigma_prev / 2).sample + model_output = (sigma_prev / 2) * model((step_output.prev_sample + 1) / 2, sigma_prev / 2).sample step_output = self.scheduler.step_correct( model_output, sigma_hat, sigma_prev, sample_hat, step_output.prev_sample, - step_output["derivative"], ) + step_output["derivative"], + ) sample = step_output.prev_sample sample = (sample / 2 + 0.5).clip(0, 1) @@ -127,6 +123,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py index 2ab0f9892a8b6..649c39a7ecdad 100644 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/__init__.py @@ -19,8 +19,12 @@ import numpy as np import paddle -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_paddle_available, is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) @dataclass diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index de047ee797c85..8ecc3b2759f33 100644 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -47,8 +47,7 @@ """ -def tensor2vid(video: paddle.Tensor, mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5]) -> List[np.ndarray]: +def tensor2vid(video: paddle.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) -> List[np.ndarray]: # This code is copied from https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 # reshape to ncfhw mean = paddle.to_tensor(mean).reshape((1, -1, 1, 1, 1)) @@ -85,29 +84,32 @@ class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin): """ def __init__( - self, - vae: 
AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet3DConditionModel, - scheduler: KarrasDiffusionSchedulers, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): """ Encodes the prompt into text encoder hidden states. @@ -145,32 +147,30 @@ def _encode_prompt( padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, - untruncated_ids): + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}" ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None - prompt_embeds = self.text_encoder( - text_input_ids, attention_mask=attention_mask) + prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask) prompt_embeds = prompt_embeds[0] prompt_embeds = prompt_embeds.cast(self.text_encoder.dtype) bs_embed, seq_len, _ = prompt_embeds.shape # duplicate text embeddings for each generation per prompt, using mps friendly method prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - (bs_embed * num_images_per_prompt, seq_len, -1)) + prompt_embeds = prompt_embeds.reshape((bs_embed * num_images_per_prompt, seq_len, -1)) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance and negative_prompt_embeds is None: uncond_tokens: List[str] @@ -191,48 +191,41 @@ def _encode_prompt( # textual inversion: procecss multi-vector tokens if necessary if isinstance(self, TextualInversionLoaderMixin): - uncond_tokens = self.maybe_convert_prompt(uncond_tokens, - self.tokenizer) + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) max_length = prompt_embeds.shape[1] uncond_input = 
self.tokenizer( uncond_tokens, padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + return_tensors="pd", + ) + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids, attention_mask=attention_mask) + negative_prompt_embeds = self.text_encoder(uncond_input.input_ids, attention_mask=attention_mask) negative_prompt_embeds = negative_prompt_embeds[0] if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.cast( - self.text_encoder.dtype) - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - (batch_size * num_images_per_prompt, seq_len, -1)) + negative_prompt_embeds = negative_prompt_embeds.cast(self.text_encoder.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape((batch_size * num_images_per_prompt, seq_len, -1)) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds def decode_latents(self, latents): latents = 1 / self.vae.config.scaling_factor * latents batch_size, channels, num_frames, height, width = latents.shape - latents = latents.transpose([0, 2, 1, 3, 4]).reshape( - (batch_size * num_frames, channels, height, width)) + latents = latents.transpose([0, 2, 1, 3, 4]).reshape((batch_size * num_frames, channels, height, width)) image = self.vae.decode(latents).sample - video = (image[None, :] - .reshape((batch_size, num_frames, -1) + tuple(image.shape[2:])) - .transpose([0, 2, 1, 3, 4])) + video = ( + image[None, :].reshape((batch_size, num_frames, -1) + tuple(image.shape[2:])).transpose([0, 2, 1, 3, 4]) + ) video = video.cast("float32") return video @@ -241,33 +234,33 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) - if (callback_steps is None or callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." ) @@ -279,11 +272,8 @@ def check_inputs( raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." @@ -295,21 +285,23 @@ def check_inputs( ) def prepare_latents( - self, - batch_size, - num_channels_latents, - num_frames, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + num_frames, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, num_frames, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." 
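The reformatted `prepare_latents` above builds a 5-D video latent of shape (batch, channels, frames, height // vae_scale_factor, width // vae_scale_factor), and `decode_latents` earlier in this file folds the frame axis into the batch axis so the 2-D VAE can decode frame by frame. A minimal sketch of that shape arithmetic, assuming purely illustrative sizes (none of the concrete values below come from the diff):

import paddle

# Illustrative sizes; the pipeline derives these from its inputs and the VAE config.
batch_size, num_channels_latents, num_frames = 1, 4, 16
height, width, vae_scale_factor = 256, 256, 8

shape = (
    batch_size,
    num_channels_latents,
    num_frames,
    height // vae_scale_factor,
    width // vae_scale_factor,
)
latents = paddle.randn(list(shape))  # stand-in for randn_tensor(shape, generator=..., dtype=...)

# decode_latents moves frames into the batch axis before calling the 2-D VAE decoder.
b, c, f, h, w = latents.shape
frames_as_batch = latents.transpose([0, 2, 1, 3, 4]).reshape((b * f, c, h, w))
assert list(frames_as_batch.shape) == [b * f, c, h, w]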
@@ -323,25 +315,25 @@ def prepare_latents( @paddle.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( - self, - prompt: Union[str, List[str]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_frames: int=16, - num_inference_steps: int=50, - guidance_scale: float=9.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="np", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: int=1, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, ): + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): """ Function invoked when calling the pipeline for generation. @@ -423,7 +415,8 @@ def __call__( callback_steps, negative_prompt, prompt_embeds, - negative_prompt_embeds, ) + negative_prompt_embeds, + ) # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -442,7 +435,8 @@ def __call__( do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -459,48 +453,38 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) # 7. 
Denoising loop - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # reshape latents bsz, channel, frames, width, height = latents.shape - latents = latents.transpose([0, 2, 1, 3, 4]).reshape( - (bsz * frames, channel, width, height)) - noise_pred = noise_pred.transpose([0, 2, 1, 3, 4]).reshape( - (bsz * frames, channel, width, height)) + latents = latents.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height)) + noise_pred = noise_pred.transpose([0, 2, 1, 3, 4]).reshape((bsz * frames, channel, width, height)) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample - latents = (latents[None, :].reshape( - (bsz, frames, channel, width, height)) - .transpose([0, 2, 1, 3, 4])) + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = latents[None, :].reshape((bsz, frames, channel, width, height)).transpose([0, 2, 1, 3, 4]) # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -510,5 +494,5 @@ def __call__( else: video = tensor2vid(video_tensor) if not return_dict: - return (video, ) + return (video,) return TextToVideoSDPipelineOutput(frames=video) diff --git a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 5f9ccbe235000..106382dceb106 100644 --- a/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/ppdiffusers/ppdiffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -20,31 +20,26 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer) +from paddlenlp.transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ppdiffusers.models import AutoencoderKL, UNet2DConditionModel from ppdiffusers.pipelines.stable_diffusion import ( - StableDiffusionPipeline, 
StableDiffusionSafetyChecker) + StableDiffusionPipeline, + StableDiffusionSafetyChecker, +) from ppdiffusers.schedulers import KarrasDiffusionSchedulers from ppdiffusers.utils import BaseOutput def rearrange_0(tensor, f): F, C, H, W = tensor.shape - tensor = paddle.transpose( - x=paddle.reshape( - x=tensor, shape=(F // f, f, C, H, W)), - perm=(0, 2, 1, 3, 4)) + tensor = paddle.transpose(x=paddle.reshape(x=tensor, shape=(F // f, f, C, H, W)), perm=(0, 2, 1, 3, 4)) return tensor def rearrange_1(tensor): B, C, F, H, W = tensor.shape - return paddle.reshape( - x=paddle.transpose( - x=tensor, perm=(0, 2, 1, 3, 4)), - shape=(B * F, C, H, W)) + return paddle.reshape(x=paddle.transpose(x=tensor, perm=(0, 2, 1, 3, 4)), shape=(B * F, C, H, W)) def rearrange_3(tensor, f): @@ -70,21 +65,15 @@ class CrossFrameAttnProcessor: def __init__(self, batch_size=2): self.batch_size = batch_size - def __call__(self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None): + def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) query = attn.to_q(hidden_states) is_cross_attention = encoder_hidden_states is not None if encoder_hidden_states is None: encoder_hidden_states = hidden_states elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states( - encoder_hidden_states) + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) @@ -144,10 +133,10 @@ def warp_single_latent(latent, reference_flow): if isinstance(latent.dtype, paddle.dtype): dtype = latent.dtype elif isinstance(latent.dtype, str) and latent.dtype not in [ - "cpu", - "cuda", - "ipu", - "xpu", + "cpu", + "cuda", + "ipu", + "xpu", ]: dtype = latent.dtype elif isinstance(latent.dtype, paddle.Tensor): @@ -161,13 +150,11 @@ def warp_single_latent(latent, reference_flow): coords_t0 = coords_t0 * 2.0 - 1.0 coords_t0 = F.interpolate(x=coords_t0, size=(h, w), mode="bilinear") coords_t0 = paddle.transpose(x=coords_t0, perm=(0, 2, 3, 1)) - warped = F.grid_sample( - x=latent, grid=coords_t0, mode="nearest", padding_mode="reflection") + warped = F.grid_sample(x=latent, grid=coords_t0, mode="nearest", padding_mode="reflection") return warped -def create_motion_field(motion_field_strength_x, motion_field_strength_y, - frame_ids, dtype): +def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, dtype): """ Create translation motion field @@ -184,15 +171,12 @@ def create_motion_field(motion_field_strength_x, motion_field_strength_y, seq_length = len(frame_ids) reference_flow = paddle.zeros(shape=(seq_length, 2, 512, 512), dtype=dtype) for fr_idx in range(seq_length): - reference_flow[(fr_idx), (0), :, :] = (motion_field_strength_x * - frame_ids[fr_idx]) - reference_flow[(fr_idx), (1), :, :] = (motion_field_strength_y * - frame_ids[fr_idx]) + reference_flow[(fr_idx), (0), :, :] = motion_field_strength_x * frame_ids[fr_idx] + reference_flow[(fr_idx), (1), :, :] = motion_field_strength_y * frame_ids[fr_idx] return reference_flow -def create_motion_field_and_warp_latents( - motion_field_strength_x, motion_field_strength_y, frame_ids, latents): +def create_motion_field_and_warp_latents(motion_field_strength_x, 
motion_field_strength_y, frame_ids, latents): """ Creates translation motion and warps the latents accordingly @@ -210,11 +194,11 @@ def create_motion_field_and_warp_latents( motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, frame_ids=frame_ids, - dtype=latents.dtype, ) + dtype=latents.dtype, + ) warped_latents = latents.clone().detach() for i in range(len(warped_latents)): - warped_latents[i] = warp_single_latent(latents[i][None], - motion_field[i][None]) + warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None]) return warped_latents @@ -244,15 +228,16 @@ class TextToVideoZeroPipeline(StableDiffusionPipeline): """ def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: KarrasDiffusionSchedulers, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPImageProcessor, - requires_safety_checker: bool=True, ): + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): super().__init__( vae, text_encoder, @@ -261,7 +246,8 @@ def __init__( scheduler, safety_checker, feature_extractor, - requires_safety_checker, ) + requires_safety_checker, + ) self.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) def forward_loop(self, x_t0, t0, t1, generator): @@ -277,24 +263,23 @@ def forward_loop(self, x_t0, t0, t1, generator): Returns: x_t1: forward process applied to x_t0 from time t0 to t1. """ - eps = paddle.randn( - shape=x_t0.shape, generator=generator, dtype=x_t0.dtype) + eps = paddle.randn(shape=x_t0.shape, generator=generator, dtype=x_t0.dtype) alpha_vec = paddle.prod(x=self.scheduler.alphas[t0:t1]) - x_t1 = paddle.sqrt(x=alpha_vec) * x_t0 + paddle.sqrt(x=1 - - alpha_vec) * eps + x_t1 = paddle.sqrt(x=alpha_vec) * x_t0 + paddle.sqrt(x=1 - alpha_vec) * eps return x_t1 def backward_loop( - self, - latents, - timesteps, - prompt_embeds, - guidance_scale, - callback, - callback_steps, - num_warmup_steps, - extra_step_kwargs, - cross_attention_kwargs=None, ): + self, + latents, + timesteps, + prompt_embeds, + guidance_scale, + callback, + callback_steps, + num_warmup_steps, + extra_step_kwargs, + cross_attention_kwargs=None, + ): """ Perform backward process given list of time steps @@ -326,32 +311,27 @@ def backward_loop( with self.progress_bar(total=num_steps) as progress_bar: for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat(x=[latents] * 2) if - do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual noise_pred = self.unet( latent_model_input, t, encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk( - chunks=2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred_uncond, 
noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if (i == len(timesteps) - 1 or i + 1 > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) @@ -359,27 +339,27 @@ def backward_loop( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - video_length: Optional[int]=8, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_videos_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - motion_field_strength_x: float=12, - motion_field_strength_y: float=12, - output_type: Optional[str]="tensor", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - t0: int=44, - t1: int=47, ): + self, + prompt: Union[str, List[str]], + video_length: Optional[int] = 8, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + motion_field_strength_x: float = 12, + motion_field_strength_y: float = 12, + output_type: Optional[str] = "tensor", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + t0: int = 44, + t1: int = 47, + ): """ Function invoked when calling the pipeline for generation. @@ -471,12 +451,14 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # Encode input prompt - prompt_embeds = self._encode_prompt(prompt, num_videos_per_prompt, - do_classifier_free_guidance, - negative_prompt) + prompt_embeds = self._encode_prompt( + prompt, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt + ) # Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps, ) + self.scheduler.set_timesteps( + num_inference_steps, + ) timesteps = self.scheduler.timesteps # Prepare latent variables @@ -488,35 +470,37 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # Prepare extra step kwargs. 
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order # Perform the first backward process up to time T_1 x_1_t1 = self.backward_loop( - timesteps=timesteps[:-t1 - 1], + timesteps=timesteps[: -t1 - 1], prompt_embeds=prompt_embeds, latents=latents, guidance_scale=guidance_scale, callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=num_warmup_steps, ) + num_warmup_steps=num_warmup_steps, + ) scheduler_copy = copy.deepcopy(self.scheduler) # Perform the second backward process up to time T_0 x_1_t0 = self.backward_loop( - timesteps=timesteps[-t1 - 1:-t0 - 1], + timesteps=timesteps[-t1 - 1 : -t0 - 1], prompt_embeds=prompt_embeds, latents=x_1_t1, guidance_scale=guidance_scale, callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=0, ) + num_warmup_steps=0, + ) # Propagate first frame latents at time T_0 to remaining frames x_2k_t0 = x_1_t0.tile(repeat_times=[video_length - 1, 1, 1, 1]) @@ -526,31 +510,34 @@ def __call__( motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, latents=x_2k_t0, - frame_ids=frame_ids[1:], ) + frame_ids=frame_ids[1:], + ) # Perform forward process up to time T_1 x_2k_t1 = self.forward_loop( x_t0=x_2k_t0, t0=timesteps[-t0 - 1].item(), t1=timesteps[-t1 - 1].item(), - generator=generator, ) + generator=generator, + ) # Perform backward process from time T_1 to 0 x_1k_t1 = paddle.concat(x=[x_1_t1, x_2k_t1]) b, l, d = prompt_embeds.shape - prompt_embeds = (prompt_embeds[:, (None)] - .tile(repeat_times=[1, video_length, 1, 1]) - .reshape([b * video_length, l, d])) + prompt_embeds = ( + prompt_embeds[:, (None)].tile(repeat_times=[1, video_length, 1, 1]).reshape([b * video_length, l, d]) + ) self.scheduler = scheduler_copy x_1k_0 = self.backward_loop( - timesteps=timesteps[-t1 - 1:], + timesteps=timesteps[-t1 - 1 :], prompt_embeds=prompt_embeds, latents=x_1k_t1, guidance_scale=guidance_scale, callback=callback, callback_steps=callback_steps, extra_step_kwargs=extra_step_kwargs, - num_warmup_steps=0, ) + num_warmup_steps=0, + ) latents = x_1k_0 paddle.device.cuda.empty_cache() if output_type == "latent": @@ -558,9 +545,7 @@ def __call__( has_nsfw_concept = None else: image = self.decode_latents(latents) - image, has_nsfw_concept = self.run_safety_checker( - image, prompt_embeds.dtype) + image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype) if not return_dict: return image, has_nsfw_concept - return TextToVideoPipelineOutput( - images=image, nsfw_content_detected=has_nsfw_concept) + return TextToVideoPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py index 90e39132e944b..4fa798729384f 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/__init__.py @@ -12,15 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) try: if not (is_paddlenlp_available() and is_paddle_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import ( - UnCLIPImageVariationPipeline, UnCLIPPipeline) + UnCLIPImageVariationPipeline, + UnCLIPPipeline, + ) else: from .pipeline_unclip import UnCLIPPipeline from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py index 4c591a6c434cb..9f9d905244ac2 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip.py @@ -75,17 +75,18 @@ class UnCLIPPipeline(DiffusionPipeline): super_res_scheduler: UnCLIPScheduler def __init__( - self, - prior: PriorTransformer, - decoder: UNet2DConditionModel, - text_encoder: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - text_proj: UnCLIPTextProjModel, - super_res_first: UNet2DModel, - super_res_last: UNet2DModel, - prior_scheduler: UnCLIPScheduler, - decoder_scheduler: UnCLIPScheduler, - super_res_scheduler: UnCLIPScheduler, ): + self, + prior: PriorTransformer, + decoder: UNet2DConditionModel, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + text_proj: UnCLIPTextProjModel, + super_res_first: UNet2DModel, + super_res_last: UNet2DModel, + prior_scheduler: UnCLIPScheduler, + decoder_scheduler: UnCLIPScheduler, + super_res_scheduler: UnCLIPScheduler, + ): super().__init__() self.register_modules( @@ -98,27 +99,27 @@ def __init__( super_res_last=super_res_last, prior_scheduler=prior_scheduler, decoder_scheduler=decoder_scheduler, - super_res_scheduler=super_res_scheduler, ) + super_res_scheduler=super_res_scheduler, + ) def prepare_latents(self, shape, dtype, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents * scheduler.init_noise_sigma return latents def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None, - text_attention_mask: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + ): if text_model_output is None: batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -128,23 +129,24 @@ def _encode_prompt( max_length=self.tokenizer.model_max_length, truncation=True, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids text_mask = text_inputs.attention_mask - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not 
paddle.equal_all(text_input_ids, - untruncated_ids): + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, : - self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] text_encoder_output = self.text_encoder(text_input_ids) @@ -155,27 +157,26 @@ def _encode_prompt( batch_size = text_model_output[0].shape[0] prompt_embeds, text_encoder_hidden_states = ( text_model_output[0], - text_model_output[1], ) + text_model_output[1], + ) text_mask = text_attention_mask # duplicate text embeddings for each generation per prompt seq_len = prompt_embeds.shape[1] prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt]) - prompt_embeds = prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) # duplicate text_encoder_hidden_states for each generation per prompt seq_len = text_encoder_hidden_states.shape[1] - text_encoder_hidden_states = text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) + text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) text_encoder_hidden_states = text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate text_mask for each generation per prompt seq_len = text_mask.shape[1] text_mask = text_mask.tile([1, num_images_per_prompt]) - text_mask = text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) # text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0) @@ -190,47 +191,38 @@ def _encode_prompt( max_length=self.tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_text_encoder_output = self.text_encoder( - uncond_input.input_ids) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids) - negative_prompt_embeds = ( - negative_prompt_embeds_text_encoder_output.text_embeds) - uncond_text_encoder_hidden_states = ( - negative_prompt_embeds_text_encoder_output.last_hidden_state) + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) + negative_prompt_embeds = 
negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) - uncond_text_encoder_hidden_states = ( - uncond_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1])) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape( + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate uncond_text_mask for each generation per prompt seq_len = uncond_text_mask.shape[1] uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt]) - uncond_text_mask = uncond_text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) # done duplicates # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = paddle.concat([ - uncond_text_encoder_hidden_states, text_encoder_hidden_states - ]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) text_mask = paddle.concat([uncond_text_mask, text_mask]) @@ -238,23 +230,23 @@ def _encode_prompt( @paddle.no_grad() def __call__( - self, - prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: int=1, - prior_num_inference_steps: int=25, - decoder_num_inference_steps: int=25, - super_res_num_inference_steps: int=7, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prior_latents: Optional[paddle.Tensor]=None, - decoder_latents: Optional[paddle.Tensor]=None, - super_res_latents: Optional[paddle.Tensor]=None, - text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]]=None, - text_attention_mask: Optional[paddle.Tensor]=None, - prior_guidance_scale: float=4.0, - decoder_guidance_scale: float=8.0, - output_type: Optional[str]="pil", - return_dict: bool=True, ): + self, + prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + prior_num_inference_steps: int = 25, + decoder_num_inference_steps: int = 25, + super_res_num_inference_steps: int = 7, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prior_latents: Optional[paddle.Tensor] = None, + decoder_latents: Optional[paddle.Tensor] = None, + super_res_latents: Optional[paddle.Tensor] = None, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[paddle.Tensor] = None, + prior_guidance_scale: float = 4.0, + decoder_guidance_scale: float = 8.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): """ Function invoked when calling the pipeline for generation. 
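Both the prior loop and the decoder loop in the hunks below apply the same classifier-free-guidance update: the batched prediction is chunked into an unconditional half and a text-conditioned half, and the guided estimate is uncond + scale * (text - uncond), with `prior_guidance_scale` defaulting to 4.0 and `decoder_guidance_scale` to 8.0. A minimal sketch of that step, using illustrative tensor sizes rather than the pipeline's real shapes:

import paddle

guidance_scale = 4.0  # e.g. prior_guidance_scale; the decoder pass uses decoder_guidance_scale (8.0)
# Stacked [unconditional, conditional] predictions, as produced from the doubled latent batch.
predicted = paddle.randn([2, 768])
pred_uncond, pred_text = predicted.chunk(chunks=2)
guided = pred_uncond + guidance_scale * (pred_text - pred_uncond)
assert guided.shape == pred_uncond.shape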
@@ -312,23 +304,21 @@ def __call__( elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") else: batch_size = text_model_output[0].shape[0] batch_size = batch_size * num_images_per_prompt - do_classifier_free_guidance = (prior_guidance_scale > 1.0 or - decoder_guidance_scale > 1.0) + do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, text_model_output, - text_attention_mask, ) + text_attention_mask, + ) # prior @@ -342,30 +332,29 @@ def __call__( prompt_embeds.dtype, generator, prior_latents, - self.prior_scheduler, ) + self.prior_scheduler, + ) for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([prior_latents] * 2) - if do_classifier_free_guidance else - prior_latents) + latent_model_input = paddle.concat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents predicted_image_embedding = self.prior( latent_model_input, timestep=t, proj_embedding=prompt_embeds, encoder_hidden_states=text_encoder_hidden_states, - attention_mask=text_mask, ).predicted_image_embedding + attention_mask=text_mask, + ).predicted_image_embedding if do_classifier_free_guidance: ( predicted_image_embedding_uncond, predicted_image_embedding_text, ) = predicted_image_embedding.chunk(2) - predicted_image_embedding = ( - predicted_image_embedding_uncond + prior_guidance_scale * - (predicted_image_embedding_text - - predicted_image_embedding_uncond)) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) if i + 1 == prior_timesteps_tensor.shape[0]: prev_timestep = None @@ -377,7 +366,8 @@ def __call__( timestep=t, sample=prior_latents, generator=generator, - prev_timestep=prev_timestep, ).prev_sample + prev_timestep=prev_timestep, + ).prev_sample prior_latents = self.prior.post_process_latents(prior_latents) @@ -390,13 +380,15 @@ def __call__( image_embeddings=image_embeddings, prompt_embeds=prompt_embeds, text_encoder_hidden_states=text_encoder_hidden_states, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) decoder_text_mask = F.pad( text_mask.unsqueeze(0), (self.text_proj.clip_extra_context_tokens, 0), value=1, - data_format="NCL", ).squeeze(0) + data_format="NCL", + ).squeeze(0) self.decoder_scheduler.set_timesteps(decoder_num_inference_steps) decoder_timesteps_tensor = self.decoder_scheduler.timesteps @@ -410,20 +402,22 @@ def __call__( text_encoder_hidden_states.dtype, generator, decoder_latents, - self.decoder_scheduler, ) + self.decoder_scheduler, + ) for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([decoder_latents] * 2) - if do_classifier_free_guidance else - decoder_latents) + latent_model_input = ( + paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents + ) noise_pred = self.decoder( sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, 
class_labels=additive_clip_time_embeddings, - attention_mask=decoder_text_mask, ).sample + attention_mask=decoder_text_mask, + ).sample if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) @@ -431,20 +425,19 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ latent_model_input.shape[1], - noise_pred_uncond.shape[1] - - latent_model_input.shape[1], + noise_pred_uncond.shape[1] - latent_model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ latent_model_input.shape[1], noise_pred_text.shape[1] - latent_model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + decoder_guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) if i + 1 == decoder_timesteps_tensor.shape[0]: prev_timestep = None @@ -457,7 +450,8 @@ def __call__( t, decoder_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample decoder_latents = decoder_latents.clip(-1, 1) @@ -479,7 +473,8 @@ def __call__( image_small.dtype, generator, super_res_latents, - self.super_res_scheduler, ) + self.super_res_scheduler, + ) interpolate_antialias = {} if "antialias" in inspect.signature(F.interpolate).parameters: @@ -490,7 +485,8 @@ def __call__( size=[height, width], mode="bicubic", align_corners=False, - **interpolate_antialias, ) + **interpolate_antialias, + ) for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): # no classifier free guidance @@ -501,15 +497,14 @@ def __call__( unet = self.super_res_first latent_model_input = paddle.concat( - [ - super_res_latents, - image_upscaled.cast(super_res_latents.dtype) - ], - axis=1, ) + [super_res_latents, image_upscaled.cast(super_res_latents.dtype)], + axis=1, + ) noise_pred = unet( sample=latent_model_input, - timestep=t, ).sample + timestep=t, + ).sample if i + 1 == super_res_timesteps_tensor.shape[0]: prev_timestep = None @@ -522,7 +517,8 @@ def __call__( t, super_res_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample image = super_res_latents # done super res @@ -537,6 +533,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py index ada35969b9c65..f303633b838ee 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -18,9 +18,12 @@ import paddle import paddle.nn.functional as F import PIL -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...models import UNet2DConditionModel, UNet2DModel from ...pipelines import DiffusionPipeline, ImagePipelineOutput @@ -78,17 +81,18 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline): super_res_scheduler: UnCLIPScheduler def __init__( 
- self, - decoder: UNet2DConditionModel, - text_encoder: CLIPTextModelWithProjection, - tokenizer: CLIPTokenizer, - text_proj: UnCLIPTextProjModel, - feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - super_res_first: UNet2DModel, - super_res_last: UNet2DModel, - decoder_scheduler: UnCLIPScheduler, - super_res_scheduler: UnCLIPScheduler, ): + self, + decoder: UNet2DConditionModel, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + text_proj: UnCLIPTextProjModel, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + super_res_first: UNet2DModel, + super_res_last: UNet2DModel, + decoder_scheduler: UnCLIPScheduler, + super_res_scheduler: UnCLIPScheduler, + ): super().__init__() self.register_modules( @@ -101,7 +105,8 @@ def __init__( super_res_first=super_res_first, super_res_last=super_res_last, decoder_scheduler=decoder_scheduler, - super_res_scheduler=super_res_scheduler, ) + super_res_scheduler=super_res_scheduler, + ) # Copied from ppdiffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, generator, latents, scheduler): @@ -109,15 +114,12 @@ def prepare_latents(self, shape, dtype, generator, latents, scheduler): latents = randn_tensor(shape, generator=generator, dtype=dtype) else: if latents.shape != list(shape): - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {shape}" - ) + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") latents = latents * scheduler.init_noise_sigma return latents - def _encode_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -126,7 +128,8 @@ def _encode_prompt(self, prompt, num_images_per_prompt, padding="max_length", max_length=self.tokenizer.model_max_length, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids text_mask = text_inputs.attention_mask text_encoder_output = self.text_encoder(text_input_ids) @@ -137,21 +140,19 @@ def _encode_prompt(self, prompt, num_images_per_prompt, # duplicate text embeddings for each generation per prompt seq_len = prompt_embeds.shape[1] prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt]) - prompt_embeds = prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + prompt_embeds = prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) # duplicate text_encoder_hidden_states for each generation per prompt seq_len = text_encoder_hidden_states.shape[1] - text_encoder_hidden_states = text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) + text_encoder_hidden_states = text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) text_encoder_hidden_states = text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate text_mask for each generation per prompt seq_len = text_mask.shape[1] text_mask = text_mask.tile([1, num_images_per_prompt]) - text_mask = text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + text_mask = text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, axis=0) # 
text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, axis=0) @@ -167,91 +168,81 @@ def _encode_prompt(self, prompt, num_images_per_prompt, max_length=max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) uncond_text_mask = uncond_input.attention_mask - negative_prompt_embeds_text_encoder_output = self.text_encoder( - uncond_input.input_ids) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids) - negative_prompt_embeds = ( - negative_prompt_embeds_text_encoder_output.text_embeds) - uncond_text_encoder_hidden_states = ( - negative_prompt_embeds_text_encoder_output.last_hidden_state) + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len]) seq_len = uncond_text_encoder_hidden_states.shape[1] - uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile( - [1, num_images_per_prompt, 1]) - uncond_text_encoder_hidden_states = ( - uncond_text_encoder_hidden_states.reshape( - [batch_size * num_images_per_prompt, seq_len, -1])) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.tile([1, num_images_per_prompt, 1]) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.reshape( + [batch_size * num_images_per_prompt, seq_len, -1] + ) # duplicate uncond_text_mask for each generation per prompt seq_len = uncond_text_mask.shape[1] uncond_text_mask = uncond_text_mask.tile([1, num_images_per_prompt]) - uncond_text_mask = uncond_text_mask.reshape( - [batch_size * num_images_per_prompt, seq_len]) + uncond_text_mask = uncond_text_mask.reshape([batch_size * num_images_per_prompt, seq_len]) # uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, axis=0) # done duplicates # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) - text_encoder_hidden_states = paddle.concat([ - uncond_text_encoder_hidden_states, text_encoder_hidden_states - ]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = paddle.concat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) text_mask = paddle.concat([uncond_text_mask, text_mask]) return prompt_embeds, text_encoder_hidden_states, text_mask def _encode_image( - self, - image, - num_images_per_prompt, - image_embeddings: Optional[paddle.Tensor]=None, ): + self, + image, + num_images_per_prompt, + image_embeddings: Optional[paddle.Tensor] = None, + ): dtype = self.image_encoder.dtype if image_embeddings is None: if not isinstance(image, paddle.Tensor): - image = self.feature_extractor( - images=image, return_tensors="pd").pixel_values + image = self.feature_extractor(images=image, return_tensors="pd").pixel_values image = image.cast(dtype) image_embeddings = self.image_encoder(image).image_embeds batch_size, seq_len = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt]) - image_embeddings = image_embeddings.reshape( - [batch_size * num_images_per_prompt, seq_len]) + image_embeddings = image_embeddings.reshape([batch_size * num_images_per_prompt, seq_len]) # image_embeddings = image_embeddings.repeat_interleave(num_images_per_prompt, axis=0) return image_embeddings @paddle.no_grad() def __call__( - self, - image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], - paddle.Tensor]]=None, - num_images_per_prompt: int=1, - decoder_num_inference_steps: int=25, - super_res_num_inference_steps: int=7, - generator: Optional[paddle.Generator]=None, - decoder_latents: Optional[paddle.Tensor]=None, - super_res_latents: Optional[paddle.Tensor]=None, - image_embeddings: Optional[paddle.Tensor]=None, - decoder_guidance_scale: float=8.0, - output_type: Optional[str]="pil", - return_dict: bool=True, ): + self, + image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor]] = None, + num_images_per_prompt: int = 1, + decoder_num_inference_steps: int = 25, + super_res_num_inference_steps: int = 7, + generator: Optional[paddle.Generator] = None, + decoder_latents: Optional[paddle.Tensor] = None, + super_res_latents: Optional[paddle.Tensor] = None, + image_embeddings: Optional[paddle.Tensor] = None, + decoder_guidance_scale: float = 8.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): """ Function invoked when calling the pipeline for generation. 
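Throughout these unCLIP hunks, per-prompt duplication of embeddings is written as `tile` followed by `reshape`; the commented-out lines note the equivalent `repeat_interleave` call. A minimal sketch showing that the two forms agree for this layout, with illustrative sizes (the variable names below are not taken from the diff):

import paddle

num_images_per_prompt = 2
embeds = paddle.arange(6, dtype="float32").reshape([2, 3])  # (batch_size, seq_len)
batch_size, seq_len = embeds.shape

# Pattern used in the diff: tile along the last axis, then fold the copies into the batch axis.
tiled = embeds.tile([1, num_images_per_prompt]).reshape([batch_size * num_images_per_prompt, seq_len])

# Equivalent form referenced by the commented-out repeat_interleave lines.
interleaved = paddle.repeat_interleave(embeds, num_images_per_prompt, axis=0)
assert bool((tiled == interleaved).all())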
@@ -307,23 +298,25 @@ def __call__( do_classifier_free_guidance = decoder_guidance_scale > 1.0 prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( - prompt, num_images_per_prompt, do_classifier_free_guidance) + prompt, num_images_per_prompt, do_classifier_free_guidance + ) - image_embeddings = self._encode_image(image, num_images_per_prompt, - image_embeddings) + image_embeddings = self._encode_image(image, num_images_per_prompt, image_embeddings) # decoder text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( image_embeddings=image_embeddings, prompt_embeds=prompt_embeds, text_encoder_hidden_states=text_encoder_hidden_states, - do_classifier_free_guidance=do_classifier_free_guidance, ) + do_classifier_free_guidance=do_classifier_free_guidance, + ) decoder_text_mask = F.pad( text_mask.unsqueeze(0), (self.text_proj.clip_extra_context_tokens, 0), value=1, - data_format="NCL", ).squeeze(0) + data_format="NCL", + ).squeeze(0) self.decoder_scheduler.set_timesteps(decoder_num_inference_steps) decoder_timesteps_tensor = self.decoder_scheduler.timesteps @@ -338,20 +331,22 @@ def __call__( text_encoder_hidden_states.dtype, generator, decoder_latents, - self.decoder_scheduler, ) + self.decoder_scheduler, + ) for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([decoder_latents] * 2) - if do_classifier_free_guidance else - decoder_latents) + latent_model_input = ( + paddle.concat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents + ) noise_pred = self.decoder( sample=latent_model_input, timestep=t, encoder_hidden_states=text_encoder_hidden_states, class_labels=additive_clip_time_embeddings, - attention_mask=decoder_text_mask, ).sample + attention_mask=decoder_text_mask, + ).sample if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) @@ -359,20 +354,19 @@ def __call__( noise_pred_uncond, _ = noise_pred_uncond.split( [ latent_model_input.shape[1], - noise_pred_uncond.shape[1] - - latent_model_input.shape[1], + noise_pred_uncond.shape[1] - latent_model_input.shape[1], ], - axis=1, ) + axis=1, + ) noise_pred_text, predicted_variance = noise_pred_text.split( [ latent_model_input.shape[1], noise_pred_text.shape[1] - latent_model_input.shape[1], ], - axis=1, ) - noise_pred = noise_pred_uncond + decoder_guidance_scale * ( - noise_pred_text - noise_pred_uncond) - noise_pred = paddle.concat( - [noise_pred, predicted_variance], axis=1) + axis=1, + ) + noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = paddle.concat([noise_pred, predicted_variance], axis=1) if i + 1 == decoder_timesteps_tensor.shape[0]: prev_timestep = None @@ -385,7 +379,8 @@ def __call__( t, decoder_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample decoder_latents = decoder_latents.clip(-1, 1) @@ -408,7 +403,8 @@ def __call__( image_small.dtype, generator, super_res_latents, - self.super_res_scheduler, ) + self.super_res_scheduler, + ) interpolate_antialias = {} if "antialias" in inspect.signature(F.interpolate).parameters: @@ -419,7 +415,8 @@ def __call__( size=[height, width], mode="bicubic", align_corners=False, - **interpolate_antialias, ) + **interpolate_antialias, + ) for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): # no classifier free guidance @@ -430,15 +427,14 @@ def 
__call__( unet = self.super_res_first latent_model_input = paddle.concat( - [ - super_res_latents, - image_upscaled.cast(super_res_latents.dtype) - ], - axis=1, ) + [super_res_latents, image_upscaled.cast(super_res_latents.dtype)], + axis=1, + ) noise_pred = unet( sample=latent_model_input, - timestep=t, ).sample + timestep=t, + ).sample if i + 1 == super_res_timesteps_tensor.shape[0]: prev_timestep = None @@ -451,7 +447,8 @@ def __call__( t, super_res_latents, prev_timestep=prev_timestep, - generator=generator, ).prev_sample + generator=generator, + ).prev_sample image = super_res_latents @@ -467,6 +464,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py b/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py index 3ce07c27f08b6..69b442fa526ee 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py +++ b/ppdiffusers/ppdiffusers/pipelines/unclip/text_proj.py @@ -29,53 +29,52 @@ class UnCLIPTextProjModel(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - *, - clip_extra_context_tokens: int=4, - clip_embeddings_dim: int=768, - time_embed_dim: int, - cross_attention_dim, ): + self, + *, + clip_extra_context_tokens: int = 4, + clip_embeddings_dim: int = 768, + time_embed_dim: int, + cross_attention_dim, + ): super().__init__() self.learned_classifier_free_guidance_embeddings = self.create_parameter( - (clip_embeddings_dim, ), + (clip_embeddings_dim,), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(0.0), ) + default_initializer=nn.initializer.Constant(0.0), + ) # parameters for additional clip time embeddings self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim) - self.clip_image_embeddings_project_to_time_embeddings = nn.Linear( - clip_embeddings_dim, time_embed_dim) + self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim) # parameters for encoder hidden states self.clip_extra_context_tokens = clip_extra_context_tokens self.clip_extra_context_tokens_proj = nn.Linear( - clip_embeddings_dim, - self.clip_extra_context_tokens * cross_attention_dim) - self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, - cross_attention_dim) + clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim + ) + self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, cross_attention_dim) self.text_encoder_hidden_states_norm = nn.LayerNorm(cross_attention_dim) def forward( - self, - *, - image_embeddings, - prompt_embeds, - text_encoder_hidden_states, - do_classifier_free_guidance, ): + self, + *, + image_embeddings, + prompt_embeds, + text_encoder_hidden_states, + do_classifier_free_guidance, + ): image_embeddings = image_embeddings.cast(self.dtype) if do_classifier_free_guidance: # Add the classifier free guidance embeddings to the image embeddings image_embeddings_batch_size = image_embeddings.shape[0] - classifier_free_guidance_embeddings = ( - self.learned_classifier_free_guidance_embeddings.unsqueeze(0)) - classifier_free_guidance_embeddings = ( - classifier_free_guidance_embeddings.expand( - [image_embeddings_batch_size, -1])) - image_embeddings = paddle.concat( - [classifier_free_guidance_embeddings, image_embeddings], axis=0) + classifier_free_guidance_embeddings = self.learned_classifier_free_guidance_embeddings.unsqueeze(0) + classifier_free_guidance_embeddings = 
classifier_free_guidance_embeddings.expand( + [image_embeddings_batch_size, -1] + ) + image_embeddings = paddle.concat([classifier_free_guidance_embeddings, image_embeddings], axis=0) # The image embeddings batch size and the text embeddings batch size are equal assert image_embeddings.shape[0] == prompt_embeds.shape[0] @@ -85,26 +84,17 @@ def forward( # "Specifically, we modify the architecture described in Nichol et al. (2021) by projecting and # adding CLIP embeddings to the existing timestep embedding, ... time_projected_prompt_embeds = self.embedding_proj(prompt_embeds) - time_projected_image_embeddings = ( - self.clip_image_embeddings_project_to_time_embeddings( - image_embeddings)) - additive_clip_time_embeddings = ( - time_projected_image_embeddings + time_projected_prompt_embeds) + time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings) + additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds # ... and by projecting CLIP embeddings into four # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" - clip_extra_context_tokens = self.clip_extra_context_tokens_proj( - image_embeddings) - clip_extra_context_tokens = clip_extra_context_tokens.reshape( - [batch_size, -1, self.clip_extra_context_tokens]) - clip_extra_context_tokens = clip_extra_context_tokens.transpose( - [0, 2, 1]) - - text_encoder_hidden_states = self.encoder_hidden_states_proj( - text_encoder_hidden_states) - text_encoder_hidden_states = self.text_encoder_hidden_states_norm( - text_encoder_hidden_states) - text_encoder_hidden_states = paddle.concat( - [clip_extra_context_tokens, text_encoder_hidden_states], axis=1) + clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) + clip_extra_context_tokens = clip_extra_context_tokens.reshape([batch_size, -1, self.clip_extra_context_tokens]) + clip_extra_context_tokens = clip_extra_context_tokens.transpose([0, 2, 1]) + + text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states) + text_encoder_hidden_states = self.text_encoder_hidden_states_norm(text_encoder_hidden_states) + text_encoder_hidden_states = paddle.concat([clip_extra_context_tokens, text_encoder_hidden_states], axis=1) return text_encoder_hidden_states, additive_clip_time_embeddings diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py index 769e211a22e88..d0e447e0ef36e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/__init__.py @@ -18,9 +18,13 @@ import numpy as np import PIL -from ...utils import (BaseOutput, OptionalDependencyNotAvailable, - is_einops_available, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + BaseOutput, + OptionalDependencyNotAvailable, + is_einops_available, + is_paddle_available, + is_paddlenlp_available, +) @dataclass @@ -40,12 +44,12 @@ class ImageTextPipelineOutput(BaseOutput): try: - if not (is_paddlenlp_available() and is_paddle_available() and - is_einops_available()): + if not (is_paddlenlp_available() and is_paddle_available() and is_einops_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from ...utils.dummy_paddle_and_paddlenlp_and_einops_objects import \ - UniDiffuserPipeline + from ...utils.dummy_paddle_and_paddlenlp_and_einops_objects import ( + UniDiffuserPipeline, + ) 
from ...utils.dummy_paddle_and_paddlenlp_objects import CaptionDecoder else: from .caption_decoder import CaptionDecoder diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py index 5fd8b8659eb9a..81f5e5a0b5212 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py +++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/caption_decoder.py @@ -27,19 +27,20 @@ class CaptionDecoder(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - prefix_length: int=77, - hidden_dim: int=64, - vocab_size: int=50258, - hidden_size: int=768, - num_hidden_layers: int=12, - intermediate_size: int=3072, - hidden_act: int="gelu", - hidden_dropout_prob: int=0.1, - attention_probs_dropout_prob: int=0.1, - max_position_embeddings: int=1024, - initializer_range: int=0.02, - eos_token_id: int=50257, ): + self, + prefix_length: int = 77, + hidden_dim: int = 64, + vocab_size: int = 50258, + hidden_size: int = 768, + num_hidden_layers: int = 12, + intermediate_size: int = 3072, + hidden_act: int = "gelu", + hidden_dropout_prob: int = 0.1, + attention_probs_dropout_prob: int = 0.1, + max_position_embeddings: int = 1024, + initializer_range: int = 0.02, + eos_token_id: int = 50257, + ): super(CaptionDecoder, self).__init__() self.prefix_length = prefix_length config = GPTConfig( @@ -52,25 +53,24 @@ def __init__( attention_probs_dropout_prob=attention_probs_dropout_prob, max_position_embeddings=max_position_embeddings, initializer_range=initializer_range, - eos_token_id=eos_token_id, ) + eos_token_id=eos_token_id, + ) self.gpt = GPTLMHeadModel(config) self.hidden_dim = hidden_dim - self.encode_prefix = (nn.Linear(hidden_size, hidden_dim) - if hidden_dim is not None else nn.Identity()) - self.decode_prefix = (nn.Linear(hidden_dim, hidden_size) - if hidden_dim is not None else nn.Identity()) + self.encode_prefix = nn.Linear(hidden_size, hidden_dim) if hidden_dim is not None else nn.Identity() + self.decode_prefix = nn.Linear(hidden_dim, hidden_size) if hidden_dim is not None else nn.Identity() def get_dummy_token(self, batch_size: int) -> paddle.Tensor: - return paddle.zeros( - [batch_size, self.prefix_length], dtype=paddle.int64) + return paddle.zeros([batch_size, self.prefix_length], dtype=paddle.int64) def forward( - self, - tokens: paddle.Tensor, - prefix: paddle.Tensor, - attention_mask: Optional[paddle.Tensor]=None, - labels: Optional[paddle.Tensor]=None, ): + self, + tokens: paddle.Tensor, + prefix: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + ): embedding_text = self.gpt.gpt.embeddings.word_embeddings(tokens) hidden = self.encode_prefix(prefix) prefix = self.decode_prefix(hidden) @@ -79,9 +79,7 @@ def forward( if labels is not None: dummy_token = self.get_dummy_token(tokens.shape[0]) labels = paddle.concat((dummy_token, tokens), axis=1) - out = self.gpt(inputs_embeds=embedding_cat, - labels=labels, - attention_mask=attention_mask) + out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask) if self.hidden_dim: return out, hidden @@ -98,24 +96,21 @@ def generate_captions(self, tokenizer, features, use_beam_search=True): for feature in features: feature = self.decode_prefix(feature) # back to the clip feature if use_beam_search: - generated_captions.append( - self.generate_beam( - tokenizer=tokenizer, embedding=feature)[0]) + generated_captions.append(self.generate_beam(tokenizer=tokenizer, 
embedding=feature)[0]) else: - generated_captions.append( - self.generate2( - tokenizer=tokenizer, embedding=feature)) + generated_captions.append(self.generate2(tokenizer=tokenizer, embedding=feature)) return generated_captions @paddle.no_grad() def generate_beam( - self, - tokenizer, - prompt=None, - embedding=None, - beam_size: int=5, - entry_length: int=67, # maximum number of words - temperature: float=1.0, ): + self, + tokenizer, + prompt=None, + embedding=None, + beam_size: int = 5, + entry_length: int = 67, # maximum number of words + temperature: float = 1.0, + ): stop_token_index = self.gpt.config.eos_token_id tokens = None scores = None @@ -132,14 +127,12 @@ def generate_beam( for i in range(entry_length): logits = self.gpt(inputs_embeds=generated) - logits = logits[:, -1, :] / (temperature - if temperature > 0 else 1.0) + logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0) logits = F.softmax(logits, axis=-1).log() if scores is None: scores, next_tokens = logits.topk(beam_size, -1) generated = generated.expand([beam_size, *generated.shape[1:]]) - next_tokens, scores = next_tokens.transpose( - [1, 0]), scores.squeeze(0) + next_tokens, scores = next_tokens.transpose([1, 0]), scores.squeeze(0) if tokens is None: tokens = next_tokens else: @@ -151,8 +144,7 @@ def generate_beam( scores_sum = scores[:, None] + logits seq_lengths[~is_stopped] += 1 scores_sum_average = scores_sum / seq_lengths[:, None] - scores_sum_average, next_tokens = scores_sum_average.reshape( - [-1]).topk(beam_size, -1) + scores_sum_average, next_tokens = scores_sum_average.reshape([-1]).topk(beam_size, -1) next_tokens_source = next_tokens // scores_sum.shape[1] seq_lengths = seq_lengths[next_tokens_source] next_tokens = next_tokens % scores_sum.shape[1] @@ -165,19 +157,18 @@ def generate_beam( is_stopped = is_stopped[next_tokens_source] is_stopped = paddle.cast(is_stopped, "bool") - next_token_embed = self.gpt.get_input_embeddings()( - next_tokens.squeeze()).reshape([generated.shape[0], 1, -1]) + next_token_embed = self.gpt.get_input_embeddings()(next_tokens.squeeze()).reshape( + [generated.shape[0], 1, -1] + ) generated = paddle.concat((generated, next_token_embed), axis=1) - is_stopped = paddle.bitwise_or( - is_stopped, next_tokens.equal(stop_token_index).squeeze()) + is_stopped = paddle.bitwise_or(is_stopped, next_tokens.equal(stop_token_index).squeeze()) if is_stopped.all(): break scores = scores / seq_lengths output_list = tokens.cpu().numpy() output_texts = [ - tokenizer.decode( - output[:int(length)], skip_special_tokens=True) + tokenizer.decode(output[: int(length)], skip_special_tokens=True) for output, length in zip(output_list, seq_lengths) ] order = scores.argsort(descending=True) @@ -186,15 +177,16 @@ def generate_beam( @paddle.no_grad() def generate2( - self, - tokenizer, - tokens=None, - prompt=None, - embedding=None, - entry_count: int=1, - entry_length: int=67, # maximum number of words - top_p: float=0.8, - temperature: float=1.0, ): + self, + tokenizer, + tokens=None, + prompt=None, + embedding=None, + entry_count: int = 1, + entry_length: int = 67, # maximum number of words + top_p: float = 0.8, + temperature: float = 1.0, + ): generated_list = [] stop_token_index = self.gpt.config.eos_token_id filter_value = -float("Inf") @@ -210,16 +202,12 @@ def generate2( for entry_idx in range(entry_length): logits = self.gpt(inputs_embeds=generated) - logits = logits[:, -1, :] / (temperature - if temperature > 0 else 1.0) + logits = logits[:, -1, :] / (temperature if temperature > 0 
else 1.0) sorted_logits = paddle.sort(logits, descending=True) sorted_indices = paddle.argsort(logits, descending=True) - cumulative_probs = paddle.cumsum( - F.softmax( - sorted_logits, axis=-1), axis=-1) + cumulative_probs = paddle.cumsum(F.softmax(sorted_logits, axis=-1), axis=-1) sorted_indices_to_remove = cumulative_probs > top_p - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[ - ..., :-1].clone() + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() sorted_indices_to_remove[..., 0] = 0 indices_to_remove = sorted_indices[sorted_indices_to_remove] @@ -235,8 +223,7 @@ def generate2( break output_list = list(tokens.squeeze().cpu().numpy()) - output_text = tokenizer.decode( - output_list, skip_special_tokens=True) + output_text = tokenizer.decode(output_list, skip_special_tokens=True) generated_list.append(output_text) return generated_list[0] diff --git a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 17bab677a8e47..c025b3e06973e 100644 --- a/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/ppdiffusers/ppdiffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -19,9 +19,13 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection, GPTTokenizer) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + GPTTokenizer, +) from PIL import Image from ...models import AutoencoderKL, UViTModel @@ -37,15 +41,15 @@ def center_crop(width, height, img): resample = {"box": Image.BOX, "lanczos": Image.LANCZOS}["lanczos"] crop = np.min(img.shape[:2]) - img = img[(img.shape[0] - crop) // 2:(img.shape[0] + crop) // 2, (img.shape[ - 1] - crop) // 2:(img.shape[1] + crop) // 2, ] # center crop + img = img[ + (img.shape[0] - crop) // 2 : (img.shape[0] + crop) // 2, + (img.shape[1] - crop) // 2 : (img.shape[1] + crop) // 2, + ] # center crop try: img = Image.fromarray(img, "RGB") except: img = Image.fromarray(img) - img = img.resize( - (width, height), - resample) # resize the center crop from [crop, crop] to [width, height] + img = img.resize((width, height), resample) # resize the center crop from [crop, crop] to [width, height] return np.array(img).astype(np.uint8) @@ -62,16 +66,17 @@ class UniDiffuserPipeline(DiffusionPipeline): scheduler: DPMSolverUniDiffuserScheduler def __init__( - self, - image_encoder: CLIPVisionModelWithProjection, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UViTModel, - vae: AutoencoderKL, - caption_decoder: CaptionDecoder, - caption_tokenizer: GPTTokenizer, - scheduler: DPMSolverUniDiffuserScheduler, ): + self, + image_encoder: CLIPVisionModelWithProjection, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UViTModel, + vae: AutoencoderKL, + caption_decoder: CaptionDecoder, + caption_tokenizer: GPTTokenizer, + scheduler: DPMSolverUniDiffuserScheduler, + ): super().__init__() self.register_modules( image_encoder=image_encoder, @@ -82,51 +87,48 @@ def __init__( vae=vae, caption_decoder=caption_decoder, caption_tokenizer=caption_tokenizer, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** 
(len(self.vae.config.block_out_channels) - 1) self.num_channels_latents = vae.latent_channels # 4 self.image_encoder_clip_img_dim = image_encoder.config.projection_dim # 512 self.text_encoder_seq_len = tokenizer.model_max_length # 77 - self.text_encoder_text_dim = ( - text_encoder.config.hidden_size // - text_encoder.config.num_attention_heads) # 64 + self.text_encoder_text_dim = text_encoder.config.hidden_size // text_encoder.config.num_attention_heads # 64 # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -139,10 +141,10 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." 
+ ) - def _infer_batch_size(self, mode, image, prompt, prompt_embeds, - num_samples): + def _infer_batch_size(self, mode, image, prompt, prompt_embeds, num_samples): if mode in ["t2i", "t2i2t"]: if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -169,20 +171,16 @@ def _split(self, x, height, width): latent_width = width // self.vae_scale_factor img_vae_dim = self.num_channels_latents * latent_height * latent_width - img_vae, img_clip = x.split( - [img_vae_dim, self.image_encoder_clip_img_dim], axis=1) + img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_clip_img_dim], axis=1) img_vae = einops.rearrange( img_vae, "B (C H W) -> B C H W", C=self.num_channels_latents, H=latent_height, - W=latent_width, ) - img_clip = einops.rearrange( - img_clip, - "B (L D) -> B L D", - L=1, - D=self.image_encoder_clip_img_dim) + W=latent_width, + ) + img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim) return img_vae, img_clip def _combine(self, img_vae, img_clip): @@ -205,24 +203,21 @@ def _split_joint(self, x, height, width): img_vae_dim = self.num_channels_latents * latent_height * latent_width text_dim = self.text_encoder_seq_len * self.text_encoder_text_dim - img_vae, img_clip, text = x.split( - [img_vae_dim, self.image_encoder_clip_img_dim, text_dim], axis=1) + img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_clip_img_dim, text_dim], axis=1) img_vae = einops.rearrange( img_vae, "B (C H W) -> B C H W", C=self.num_channels_latents, H=latent_height, - W=latent_width, ) - img_clip = einops.rearrange( - img_clip, - "B (L D) -> B L D", - L=1, - D=self.image_encoder_clip_img_dim) + W=latent_width, + ) + img_clip = einops.rearrange(img_clip, "B (L D) -> B L D", L=1, D=self.image_encoder_clip_img_dim) text = einops.rearrange( text, "B (L D) -> B L D", L=self.text_encoder_seq_len, - D=self.text_encoder_text_dim, ) + D=self.text_encoder_text_dim, + ) return img_vae, img_clip, text def _combine_joint(self, img_vae, img_clip, text): @@ -238,34 +233,29 @@ def _combine_joint(self, img_vae, img_clip, text): # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt def encode_text_latents( - self, - prompt, - num_images_per_prompt, - negative_prompt=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, ): + self, + prompt, + num_images_per_prompt, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + ): if prompt_embeds is None: text_inputs = self.tokenizer( prompt, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) prompt_embeds = self.text_encoder(text_inputs.input_ids)[0] return prompt_embeds # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents - def encode_image_vae_latents(self, - image, - batch_size, - num_images_per_prompt, - dtype, - generator=None): + def encode_image_vae_latents(self, image, batch_size, num_images_per_prompt, dtype, generator=None): if not isinstance(image, paddle.Tensor): - raise ValueError( - f"`image` has to be of type `paddle.Tensor`, but is {type(image)}" - ) + raise ValueError(f"`image` has to be of type `paddle.Tensor`, but is {type(image)}") image = image.cast(dtype) batch_size = batch_size * num_images_per_prompt @@ 
-278,17 +268,14 @@ def encode_image_vae_latents(self, # vae encode if isinstance(generator, list): image_latents = [ - self.vae.encode(image[i:i + 1]).latent_dist.sample(generator[i]) - * self.vae.scaling_factor for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) * self.vae.scaling_factor + for i in range(batch_size) ] image_latents = paddle.concat(image_latents, axis=0) else: - image_latents = ( - self.vae.encode(image).latent_dist.sample(generator) * - self.vae.scaling_factor) + image_latents = self.vae.encode(image).latent_dist.sample(generator) * self.vae.scaling_factor - if (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] != 0): + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." ) @@ -299,22 +286,20 @@ def encode_image_vae_latents(self, # Modified from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents def encode_image_clip_latents( - self, - image, - batch_size, - num_images_per_prompt, - dtype, ): + self, + image, + batch_size, + num_images_per_prompt, + dtype, + ): batch_size = batch_size * num_images_per_prompt # clip encode - inputs = self.image_feature_extractor( - images=Image.fromarray(image), return_tensors="pd").pixel_values + inputs = self.image_feature_extractor(images=Image.fromarray(image), return_tensors="pd").pixel_values # TODO junnyu, support float16 we need cast dtype - image_latents = self.image_encoder( - inputs.cast(self.image_encoder.dtype)).image_embeds.unsqueeze(1) + image_latents = self.image_encoder(inputs.cast(self.image_encoder.dtype)).image_embeds.unsqueeze(1) - if (batch_size > image_latents.shape[0] and - batch_size % image_latents.shape[0] != 0): + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: raise ValueError( f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." ) @@ -333,13 +318,7 @@ def decode_image_latents(self, latents): return image # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_text_latents(self, - batch_size, - seq_len, - hidden_size, - dtype, - generator, - latents=None): + def prepare_text_latents(self, batch_size, seq_len, hidden_size, dtype, generator, latents=None): # Prepare text latents for the CLIP embedded prompt. shape = [batch_size, seq_len, hidden_size] if isinstance(generator, list) and len(generator) != batch_size: @@ -357,14 +336,15 @@ def prepare_text_latents(self, # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_image_vae_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): # Prepare latents for the VAE embedded image. 
shape = [ batch_size, @@ -386,12 +366,7 @@ def prepare_image_vae_latents( return latents # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_image_clip_latents(self, - batch_size, - clip_img_dim, - dtype, - generator, - latents=None): + def prepare_image_clip_latents(self, batch_size, clip_img_dim, dtype, generator, latents=None): # Prepare latents for the CLIP embedded image. shape = [batch_size, 1, clip_img_dim] if isinstance(generator, list) and len(generator) != batch_size: @@ -408,66 +383,61 @@ def prepare_image_clip_latents(self, return latents def get_noise_pred( - self, - mode, - latents, - t, - img_vae, - img_clip, - prompt_embeds, - N, - guidance_scale, - height, - width, - data_type=1, - generator=None, ): + self, + mode, + latents, + t, + img_vae, + img_clip, + prompt_embeds, + N, + guidance_scale, + height, + width, + data_type=1, + generator=None, + ): dtype = self.unet.dtype if mode == "joint": - img_vae_latents, img_clip_latents, text_latents = self._split_joint( - latents, height, width) + img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width) img_vae_out, img_clip_out, text_out = self.unet( img=img_vae_latents, clip_img=img_clip_latents, text=text_latents, t_img=t, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) x_out = self._combine_joint(img_vae_out, img_clip_out, text_out) if guidance_scale == 0.0: return x_out - img_vae_T = randn_tensor( - img_vae.shape, generator=generator, dtype=dtype) - img_clip_T = randn_tensor( - img_clip.shape, generator=generator, dtype=dtype) + img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype) _, _, text_out_uncond = self.unet( img=img_vae_T, clip_img=img_clip_T, text=text_latents, t_img=paddle.ones_like(t) * N, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) - text_T = randn_tensor( - prompt_embeds.shape, generator=generator, dtype=dtype) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) + text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype) img_vae_out_uncond, img_clip_out_uncond, _ = self.unet( img=img_vae_latents, clip_img=img_clip_latents, text=text_T, t_img=t, t_text=paddle.ones_like(t) * N, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) - x_out_uncond = self._combine_joint( - img_vae_out_uncond, img_clip_out_uncond, text_out_uncond) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) + x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond) return x_out + guidance_scale * (x_out - x_out_uncond) elif mode == "t2i": - img_vae_latents, img_clip_latents = self._split(latents, height, - width) + img_vae_latents, img_clip_latents = self._split(latents, height, width) t_text = paddle.zeros([t.shape[0]], dtype=paddle.int32) img_vae_out, img_clip_out, text_out = self.unet( img=img_vae_latents, @@ -475,25 +445,23 @@ def get_noise_pred( text=prompt_embeds, t_img=t, t_text=t_text, - data_type=paddle.zeros_like( - t_text, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, + ) img_out = self._combine(img_vae_out, img_clip_out) if guidance_scale == 0.0: return img_out - text_T = randn_tensor( - prompt_embeds.shape, 
generator=generator, dtype=dtype) + text_T = randn_tensor(prompt_embeds.shape, generator=generator, dtype=dtype) img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( img=img_vae_latents, clip_img=img_clip_latents, text=text_T, t_img=t, t_text=paddle.ones_like(t) * N, - data_type=paddle.zeros_like( - t_text, dtype=paddle.int32) + data_type, ) - img_out_uncond = self._combine(img_vae_out_uncond, - img_clip_out_uncond) + data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, + ) + img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond) return img_out + guidance_scale * (img_out - img_out_uncond) @@ -505,23 +473,21 @@ def get_noise_pred( text=latents, t_img=t_img, t_text=t, - data_type=paddle.zeros_like( - t_img, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t_img, dtype=paddle.int32) + data_type, + ) if guidance_scale == 0.0: return text_out - img_vae_T = randn_tensor( - img_vae.shape, generator=generator, dtype=dtype) - img_clip_T = randn_tensor( - img_clip.shape, generator=generator, dtype=dtype) + img_vae_T = randn_tensor(img_vae.shape, generator=generator, dtype=dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, dtype=dtype) img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( img=img_vae_T, clip_img=img_clip_T, text=latents, t_img=paddle.ones_like(t) * N, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) return text_out + guidance_scale * (text_out - text_out_uncond) elif mode == "t": @@ -531,13 +497,12 @@ def get_noise_pred( text=latents, t_img=paddle.ones_like(t) * N, t_text=t, - data_type=paddle.zeros_like( - t, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t, dtype=paddle.int32) + data_type, + ) return text_out elif mode == "i": - img_vae_latents, img_clip_latents = self._split(latents, height, - width) + img_vae_latents, img_clip_latents = self._split(latents, height, width) t_text = paddle.ones_like(t) * N img_vae_out, img_clip_out, text_out = self.unet( img=img_vae_latents, @@ -545,8 +510,8 @@ def get_noise_pred( text=prompt_embeds, t_img=t, t_text=t_text, - data_type=paddle.zeros_like( - t_text, dtype=paddle.int32) + data_type, ) + data_type=paddle.zeros_like(t_text, dtype=paddle.int32) + data_type, + ) img_out = self._combine(img_vae_out, img_clip_out) return img_out @@ -557,36 +522,34 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def _denoising_sample_fn( - self, - mode, - image_vae_latents, - image_clip_latents, - prompt_embeds, - num_inference_steps, - extra_step_kwargs, - guidance_scale, - height, - width, - callback, - callback_steps, ): + self, + mode, + image_vae_latents, + image_clip_latents, + prompt_embeds, + num_inference_steps, + extra_step_kwargs, + guidance_scale, + 
height, + width, + callback, + callback_steps, + ): # Prepare latent variables if mode == "joint": - latents = self._combine_joint(image_vae_latents, image_clip_latents, - prompt_embeds) + latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds) elif mode in ["t2i", "i"]: latents = self._combine(image_vae_latents, image_clip_latents) elif mode in ["i2t", "t"]: @@ -599,8 +562,7 @@ def _denoising_sample_fn( timesteps = self.scheduler.timesteps N = self.scheduler.config.num_train_timesteps - num_warmup_steps = len( - timesteps) - num_inference_steps * self.scheduler.order + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): noise_pred = self.get_noise_pred( @@ -613,27 +575,23 @@ def _denoising_sample_fn( N, guidance_scale, height, - width, ) + width, + ) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided - if i == len(timesteps) - 1 or ( - (i + 1) > num_warmup_steps and - (i + 1) % self.scheduler.order == 0): + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: callback(i, t, latents) if mode == "joint": - image_vae_latents, image_clip_latents, text_latents = self._split_joint( - latents, height, width) + image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width) return image_vae_latents, image_clip_latents, text_latents elif mode in ["t2i", "i"]: - image_vae_latents, image_clip_latents = self._split(latents, height, - width) + image_vae_latents, image_clip_latents = self._split(latents, height, width) return image_vae_latents, image_clip_latents elif mode in ["i2t", "t"]: text_latents = latents @@ -641,32 +599,32 @@ def _denoising_sample_fn( @paddle.no_grad() def __call__( - self, - mode: str="t2i", # t2i, i2t, t2i2t, i2t2i, joint, i, t - image: Optional[Union[paddle.Tensor, PIL.Image.Image]]=None, - prompt: Optional[Union[str, List[str]]]=None, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.0, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - num_prompts_per_image: Optional[int]=1, - num_samples: int=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - prompt_latents: Optional[paddle.Tensor]=None, - vae_latents: Optional[paddle.Tensor]=None, - clip_latents: Optional[paddle.Tensor]=None, - prompt_embeds: Optional[paddle.Tensor]=None, - negative_prompt_embeds: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - use_beam_search: Optional[bool]=True, - **kwargs, ): + self, + mode: str = "t2i", # t2i, i2t, t2i2t, i2t2i, joint, i, t + image: Optional[Union[paddle.Tensor, PIL.Image.Image]] = None, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 
1, + num_prompts_per_image: Optional[int] = 1, + num_samples: int = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + prompt_latents: Optional[paddle.Tensor] = None, + vae_latents: Optional[paddle.Tensor] = None, + clip_latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + use_beam_search: Optional[bool] = True, + **kwargs, + ): # 0. Default height and width to unet height = height or self.unet.config.img_size * self.vae_scale_factor width = width or self.unet.config.img_size * self.vae_scale_factor @@ -679,8 +637,7 @@ def __call__( self.check_inputs([prompt], height, width, callback_steps) # 2. Define call parameters - batch_size = self._infer_batch_size(mode, image, prompt, prompt_embeds, - num_samples) + batch_size = self._infer_batch_size(mode, image, prompt, prompt_embeds, num_samples) # 3. Encode input prompt if available; otherwise prepare text latents if mode in ["t2i", "t2i2t"]: @@ -691,7 +648,8 @@ def __call__( num_images_per_prompt, negative_prompt, prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, ) + negative_prompt_embeds=negative_prompt_embeds, + ) # Encode contexts to lower text dim, 768 -> 64 prompt_embeds = self.unet.encode_prefix(prompt_embeds) else: @@ -700,10 +658,10 @@ def __call__( batch_size, self.text_encoder_seq_len, self.text_encoder_text_dim, - paddle. - float32, # Placeholder, need to determine correct thing to do for dtype + paddle.float32, # Placeholder, need to determine correct thing to do for dtype generator, - prompt_latents, ) + prompt_latents, + ) # 4. Encode input image if available; otherwise prepare image latents if mode in ["i2t", "i2t2i"]: @@ -716,7 +674,8 @@ def __call__( image_crop, batch_size, num_prompts_per_image, # not num_images_per_prompt - prompt_embeds.dtype, ) + prompt_embeds.dtype, + ) # Encode image using VAE image_vae = (image_crop / 127.5 - 1.0).astype(np.float32) image_vae = einops.rearrange(image_vae, "h w c -> 1 c h w") @@ -725,7 +684,8 @@ def __call__( batch_size, num_prompts_per_image, # not num_images_per_prompt prompt_embeds.dtype, - generator, ) + generator, + ) else: # 4.2. Prepare image latent variables, if necessary @@ -735,7 +695,8 @@ def __call__( self.image_encoder_clip_img_dim, prompt_embeds.dtype, generator, - clip_latents, ) + clip_latents, + ) # Prepare image VAE latents image_vae_latents = self.prepare_image_vae_latents( batch_size * num_images_per_prompt, @@ -744,7 +705,8 @@ def __call__( width, prompt_embeds.dtype, generator, - vae_latents, ) + vae_latents, + ) # 5. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -762,7 +724,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) elif mode in ["i2t2i"]: # 'i2t2i' should do 'i2t' first outs = self._denoising_sample_fn( @@ -776,7 +739,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) elif mode in ["t2i2t"]: # 't2i2t' should do 't2i' first outs = self._denoising_sample_fn( @@ -790,7 +754,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) else: raise ValueError @@ -800,9 +765,8 @@ def __call__( image_vae_latents, image_clip_latents, text_latents = outs gen_image = self.decode_image_latents(image_vae_latents) gen_text = self.caption_decoder.generate_captions( - self.caption_tokenizer, - text_latents, - use_beam_search=use_beam_search) + self.caption_tokenizer, text_latents, use_beam_search=use_beam_search + ) elif mode in ["t2i", "i", "t2i2t"]: image_vae_latents, image_clip_latents = outs @@ -814,10 +778,10 @@ def __call__( batch_size, self.text_encoder_seq_len, self.text_encoder_text_dim, - paddle. - float32, # Placeholder, need to determine correct thing to do for dtype + paddle.float32, # Placeholder, need to determine correct thing to do for dtype generator, - prompt_latents, ) + prompt_latents, + ) text_latents = self._denoising_sample_fn( "i2t", image_vae_latents, @@ -829,11 +793,13 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) gen_text = self.caption_decoder.generate_captions( self.caption_tokenizer, text_latents, - use_beam_search=use_beam_search, ) + use_beam_search=use_beam_search, + ) elif mode in ["i2t", "t", "i2t2i"]: text_latents = outs @@ -841,7 +807,8 @@ def __call__( gen_text = self.caption_decoder.generate_captions( self.caption_tokenizer, text_latents, - use_beam_search=use_beam_search, ) + use_beam_search=use_beam_search, + ) else: # 'i2t2i' should do 't2i' later # Prepare image CLIP latents @@ -850,7 +817,8 @@ def __call__( self.image_encoder_clip_img_dim, prompt_embeds.dtype, generator, - clip_latents, ) + clip_latents, + ) # Prepare image VAE latents image_vae_latents = self.prepare_image_vae_latents( batch_size * num_images_per_prompt, @@ -859,7 +827,8 @@ def __call__( width, prompt_embeds.dtype, generator, - vae_latents, ) + vae_latents, + ) image_vae_latents, image_clip_latents = self._denoising_sample_fn( "t2i", image_vae_latents, @@ -871,7 +840,8 @@ def __call__( height, width, callback, - callback_steps, ) + callback_steps, + ) gen_image = self.decode_image_latents(image_vae_latents) # 8. Convert gen_image to PIL, gen_text has no else processing diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py index ac2ddb173413d..309b32b2d1129 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/__init__.py @@ -13,8 +13,11 @@ # limitations under the License. 
# flake8: noqa -from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_paddlenlp_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_paddlenlp_available, +) try: if not (is_paddlenlp_available() and is_paddle_available()): @@ -22,14 +25,19 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_paddlenlp_objects import ( VersatileDiffusionDualGuidedPipeline, - VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, - VersatileDiffusionTextToImagePipeline) + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) else: from .modeling_text_unet import UNetFlatConditionModel from .pipeline_versatile_diffusion import VersatileDiffusionPipeline - from .pipeline_versatile_diffusion_dual_guided import \ - VersatileDiffusionDualGuidedPipeline - from .pipeline_versatile_diffusion_image_variation import \ - VersatileDiffusionImageVariationPipeline - from .pipeline_versatile_diffusion_text_to_image import \ - VersatileDiffusionTextToImagePipeline + from .pipeline_versatile_diffusion_dual_guided import ( + VersatileDiffusionDualGuidedPipeline, + ) + from .pipeline_versatile_diffusion_image_variation import ( + VersatileDiffusionImageVariationPipeline, + ) + from .pipeline_versatile_diffusion_text_to_image import ( + VersatileDiffusionTextToImagePipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 69099f5186cf6..377ab850f1e93 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -24,11 +24,13 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin from ...models.attention import Attention -from ...models.attention_processor import (AttentionProcessor, - AttnAddedKVProcessor, AttnProcessor) +from ...models.attention_processor import ( + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import (GaussianFourierProjection, TimestepEmbedding, - Timesteps) +from ...models.embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput from ...utils import NEG_INF, deprecate, logging @@ -37,30 +39,29 @@ def get_down_block( - down_block_type, - num_layers, - in_channels, - out_channels, - temb_channels, - add_downsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - downsample_padding=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, # HF missing in v0.16.1 - resnet_out_scale_factor=1.0, # HF missing in v0.16.1 - cross_attention_norm=None, # HF missing in v0.16.1 - resnet_pre_temb_non_linearity: bool=False, ): - down_block_type = (down_block_type[7:] - if down_block_type.startswith("UNetRes") else - down_block_type) + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + 
downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, # HF missing in v0.16.1 + resnet_out_scale_factor=1.0, # HF missing in v0.16.1 + cross_attention_norm=None, # HF missing in v0.16.1 + resnet_pre_temb_non_linearity: bool = False, +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type if down_block_type == "DownBlockFlat": return DownBlockFlat( num_layers=num_layers, @@ -73,12 +74,11 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif down_block_type == "CrossAttnDownBlockFlat": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnDownBlockFlat" - ) + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat") return CrossAttnDownBlockFlat( num_layers=num_layers, in_channels=in_channels, @@ -95,34 +95,35 @@ def get_down_block( use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{down_block_type} is not supported.") def get_up_block( - up_block_type, - num_layers, - in_channels, - out_channels, - prev_output_channel, - temb_channels, - add_upsample, - resnet_eps, - resnet_act_fn, - attn_num_head_channels, - resnet_groups=None, - cross_attention_dim=None, - dual_cross_attention=False, - use_linear_projection=False, - only_cross_attention=False, - upcast_attention=False, - resnet_time_scale_shift="default", - resnet_skip_time_act=False, # HF missing in v0.16.1 - resnet_out_scale_factor=1.0, # HF missing in v0.16.1 - cross_attention_norm=None, # HF missing in v0.16.1 - resnet_pre_temb_non_linearity: bool=False, ): - up_block_type = (up_block_type[7:] - if up_block_type.startswith("UNetRes") else up_block_type) + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, # HF missing in v0.16.1 + resnet_out_scale_factor=1.0, # HF missing in v0.16.1 + cross_attention_norm=None, # HF missing in v0.16.1 + resnet_pre_temb_non_linearity: bool = False, +): + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type if up_block_type == "UpBlockFlat": return UpBlockFlat( num_layers=num_layers, @@ -135,11 +136,11 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif up_block_type == "CrossAttnUpBlockFlat": if cross_attention_dim is None: - raise ValueError( - "cross_attention_dim must be specified for CrossAttnUpBlockFlat") + raise ValueError("cross_attention_dim must be specified for 
CrossAttnUpBlockFlat") return CrossAttnUpBlockFlat( num_layers=num_layers, in_channels=in_channels, @@ -156,7 +157,8 @@ def get_up_block( use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, resnet_time_scale_shift=resnet_time_scale_shift, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) raise ValueError(f"{up_block_type} is not supported.") @@ -236,54 +238,57 @@ class conditioning with `class_embed_type` equal to `None`. @register_to_config def __init__( - self, - sample_size: Optional[int]=None, - in_channels: int=4, - out_channels: int=4, - center_input_sample: bool=False, - flip_sin_to_cos: bool=True, - freq_shift: int=0, - down_block_types: Tuple[str]=( - "CrossAttnDownBlockFlat", - "CrossAttnDownBlockFlat", - "CrossAttnDownBlockFlat", - "DownBlockFlat", ), - mid_block_type: Optional[str]="UNetMidBlockFlatCrossAttn", - up_block_types: Tuple[str]=( - "UpBlockFlat", - "CrossAttnUpBlockFlat", - "CrossAttnUpBlockFlat", - "CrossAttnUpBlockFlat", ), - only_cross_attention: Union[bool, Tuple[bool]]=False, - block_out_channels: Tuple[int]=(320, 640, 1280, 1280), - layers_per_block: Union[int, Tuple[int]]=2, - downsample_padding: int=1, - mid_block_scale_factor: float=1, - act_fn: str="silu", - norm_num_groups: Optional[int]=32, - norm_eps: float=1e-5, - cross_attention_dim: Union[int, Tuple[int]]=1280, - encoder_hid_dim: Optional[int]=None, - attention_head_dim: Union[int, Tuple[int]]=8, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - class_embed_type: Optional[str]=None, - num_class_embeds: Optional[int]=None, - upcast_attention: bool=False, - resnet_time_scale_shift: str="default", - resnet_skip_time_act: bool=False, - resnet_out_scale_factor: int=1.0, - time_embedding_type: str="positional", # fourier, positional - time_embedding_act_fn: Optional[str]=None, - timestep_post_act: Optional[str]=None, - time_cond_proj_dim: Optional[int]=None, - conv_in_kernel: int=3, - conv_out_kernel: int=3, - projection_class_embeddings_input_dim: Optional[int]=None, - class_embeddings_concat: bool=False, - mid_block_only_cross_attention: Optional[bool]=None, - cross_attention_norm: Optional[str]=None, - resnet_pre_temb_non_linearity: Optional[bool]=False, ): + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlockFlat", + "CrossAttnDownBlockFlat", + "CrossAttnDownBlockFlat", + "DownBlockFlat", + ), + mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlockFlat", + "CrossAttnUpBlockFlat", + "CrossAttnUpBlockFlat", + "CrossAttnUpBlockFlat", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + encoder_hid_dim: Optional[int] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", 
+ resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", # fourier, positional + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + resnet_pre_temb_non_linearity: Optional[bool] = False, + ): super().__init__() self.sample_size = sample_size @@ -292,7 +297,8 @@ def __init__( if len(down_block_types) != len(up_block_types): raise ValueError( "Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`:" - f" {down_block_types}. `up_block_types`: {up_block_types}.") + f" {down_block_types}. `up_block_types`: {up_block_types}." + ) if len(block_out_channels) != len(down_block_types): raise ValueError( @@ -300,35 +306,28 @@ def __init__( f" {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance( - only_cross_attention, - bool) and len(only_cross_attention) != len(down_block_types): + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): raise ValueError( "Must provide the same number of `only_cross_attention` as `down_block_types`." f" `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) - if not isinstance( - attention_head_dim, - int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( "Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`:" f" {attention_head_dim}. `down_block_types`: {down_block_types}." ) - if isinstance( - cross_attention_dim, - list) and len(cross_attention_dim) != len(down_block_types): + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): raise ValueError( "Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`:" f" {cross_attention_dim}. `down_block_types`: {down_block_types}." ) - if not isinstance( - layers_per_block, - int) and len(layers_per_block) != len(down_block_types): + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): raise ValueError( "Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`:" - f" {layers_per_block}. `down_block_types`: {down_block_types}.") + f" {layers_per_block}. `down_block_types`: {down_block_types}." + ) # input conv_in_padding = (conv_in_kernel - 1) // 2 @@ -336,26 +335,25 @@ def __init__( in_channels, block_out_channels[0], kernel_size=conv_in_kernel, - padding=conv_in_padding, ) + padding=conv_in_padding, + ) # time if time_embedding_type == "fourier": time_embed_dim = block_out_channels[0] * 2 if time_embed_dim % 2 != 0: - raise ValueError( - f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." 
- ) + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") self.time_proj = GaussianFourierProjection( time_embed_dim // 2, set_W_to_weight=False, log=False, - flip_sin_to_cos=flip_sin_to_cos, ) + flip_sin_to_cos=flip_sin_to_cos, + ) timestep_input_dim = time_embed_dim elif time_embedding_type == "positional": time_embed_dim = block_out_channels[0] * 4 - self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, - freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] else: raise ValueError( @@ -367,20 +365,18 @@ def __init__( time_embed_dim, act_fn=act_fn, post_act_fn=timestep_post_act, - cond_proj_dim=time_cond_proj_dim, ) + cond_proj_dim=time_cond_proj_dim, + ) if encoder_hid_dim is not None: - self.encoder_hid_proj = nn.Linear(encoder_hid_dim, - cross_attention_dim) + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) else: self.encoder_hid_proj = None # class embedding if class_embed_type is None and num_class_embeds is not None: - self.class_embedding = nn.Embedding(num_class_embeds, - time_embed_dim) + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) elif class_embed_type == "timestep": - self.class_embedding = TimestepEmbedding(timestep_input_dim, - time_embed_dim) + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) elif class_embed_type == "identity": self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) elif class_embed_type == "projection": @@ -395,15 +391,13 @@ def __init__( # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
- self.class_embedding = TimestepEmbedding( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) elif class_embed_type == "simple_projection": if projection_class_embeddings_input_dim is None: raise ValueError( "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" ) - self.class_embedding = nn.Linear( - projection_class_embeddings_input_dim, time_embed_dim) + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) else: self.class_embedding = None @@ -418,8 +412,7 @@ def __init__( elif time_embedding_act_fn == "gelu": self.time_embed_act = nn.GELU() else: - raise ValueError( - f"Unsupported activation function: {time_embedding_act_fn}") + raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") self.down_blocks = nn.LayerList([]) self.up_blocks = nn.LayerList([]) @@ -440,18 +433,16 @@ def __init__( if mid_block_only_cross_attention is None: mid_block_only_cross_attention = only_cross_attention - only_cross_attention = [only_cross_attention] * len( - down_block_types) + only_cross_attention = [only_cross_attention] * len(down_block_types) if mid_block_only_cross_attention is None: mid_block_only_cross_attention = False if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim, ) * len(down_block_types) + attention_head_dim = (attention_head_dim,) * len(down_block_types) if isinstance(cross_attention_dim, int): - cross_attention_dim = ( - cross_attention_dim, ) * len(down_block_types) + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) if isinstance(layers_per_block, int): layers_per_block = [layers_per_block] * len(down_block_types) @@ -492,7 +483,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.down_blocks.append(down_block) # mid @@ -510,7 +502,8 @@ def __init__( dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type == "UNetMidBlockFlatSimpleCrossAttn": self.mid_block = UNetMidBlockFlatSimpleCrossAttn( in_channels=block_out_channels[-1], @@ -525,7 +518,8 @@ def __init__( skip_time_act=resnet_skip_time_act, only_cross_attention=mid_block_only_cross_attention, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) elif mid_block_type is None: self.mid_block = None else: @@ -547,8 +541,7 @@ def __init__( prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] - input_channel = reversed_block_out_channels[min( - i + 1, len(block_out_channels) - 1)] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] # add upsample block for all BUT final layer if not is_final_block: @@ -578,7 +571,8 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, - resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + 
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -587,7 +581,8 @@ def __init__( self.conv_norm_out = nn.GroupNorm( num_channels=block_out_channels[0], num_groups=norm_num_groups, - epsilon=norm_eps, ) + epsilon=norm_eps, + ) self.conv_act = nn.Silu() else: self.conv_norm_out = None @@ -598,16 +593,20 @@ def __init__( block_out_channels[0], out_channels, kernel_size=conv_out_kernel, - padding=conv_out_padding, ) + padding=conv_out_padding, + ) @property def in_channels(self): deprecate( "in_channels", "1.0.0", - ("Accessing `in_channels` directly via unet.in_channels is deprecated. Please use" - " `unet.config.in_channels` instead"), - standard_warn=False, ) + ( + "Accessing `in_channels` directly via unet.in_channels is deprecated. Please use" + " `unet.config.in_channels` instead" + ), + standard_warn=False, + ) return self.config.in_channels @property @@ -620,16 +619,12 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]: # set recursively processors = {} - def fn_recursive_add_processors( - name: str, - module: nn.Layer, - processors: Dict[str, AttentionProcessor]): + def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]): if hasattr(module, "set_processor"): processors[f"{name}.processor"] = module.processor for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, - processors) + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) return processors @@ -638,9 +633,7 @@ def fn_recursive_add_processors( return processors - def set_attn_processor(self, - processor: Union[AttentionProcessor, Dict[ - str, AttentionProcessor]]): + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" Parameters: `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): @@ -665,8 +658,7 @@ def fn_recursive_attn_processor(name: str, module: nn.Layer, processor): module.set_processor(processor.pop(f"{name}.processor")) for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, - processor) + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) @@ -714,8 +706,7 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): # make smallest slice possible slice_size = num_sliceable_layers * [1] - slice_size = (num_sliceable_layers * [slice_size] - if not isinstance(slice_size, list) else slice_size) + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size if len(slice_size) != len(sliceable_head_dims): raise ValueError( @@ -727,14 +718,12 @@ def fn_recursive_retrieve_sliceable_dims(module: nn.Layer): size = slice_size[i] dim = sliceable_head_dims[i] if size is not None and size > dim: - raise ValueError( - f"size {size} has to be smaller or equal to {dim}.") + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") # Recursively walk through all the children. 
# Any children which exposes the set_attention_slice method # gets the message - def fn_recursive_set_attention_slice(module: nn.Layer, - slice_size: List[int]): + def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]): if hasattr(module, "set_attention_slice"): module.set_attention_slice(slice_size.pop()) @@ -747,24 +736,24 @@ def fn_recursive_set_attention_slice(module: nn.Layer, def _set_gradient_checkpointing(self, module, value=False): if isinstance( - module, - (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, - UpBlockFlat), ): + module, + (CrossAttnDownBlockFlat, DownBlockFlat, CrossAttnUpBlockFlat, UpBlockFlat), + ): module.gradient_checkpointing = value def forward( - self, - sample: paddle.Tensor, - timestep: Union[paddle.Tensor, float, int], - encoder_hidden_states: paddle.Tensor, - class_labels: Optional[paddle.Tensor]=None, - timestep_cond: Optional[paddle.Tensor]=None, - attention_mask: Optional[paddle.Tensor]=None, - cross_attention_kwargs: Optional[Dict[str, Any]]=None, - down_block_additional_residuals: Optional[Tuple[ - paddle.Tensor]]=None, - mid_block_additional_residual: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[UNet2DConditionOutput, Tuple]: + self, + sample: paddle.Tensor, + timestep: Union[paddle.Tensor, float, int], + encoder_hidden_states: paddle.Tensor, + class_labels: Optional[paddle.Tensor] = None, + timestep_cond: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None, + mid_block_additional_residual: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: r""" Args: sample (`paddle.Tensor`): (batch, channel, height, width) noisy inputs tensor @@ -795,8 +784,7 @@ def forward( upsample_size = None if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): - logger.info( - "Forward upsample size to force interpolation output size.") + logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True # prepare attention_mask @@ -816,7 +804,11 @@ def forward( timesteps = timesteps[None] # broadcast to batch dimension in a way that's compatible with ONNX/Core ML - timesteps = timesteps.expand([sample.shape[0], ]) + timesteps = timesteps.expand( + [ + sample.shape[0], + ] + ) t_emb = self.time_proj(timesteps) # timesteps does not contain any weights and will always return f32 tensors @@ -828,8 +820,7 @@ def forward( if self.class_embedding is not None: if class_labels is None: - raise ValueError( - "class_labels should be provided when num_class_embeds > 0") + raise ValueError("class_labels should be provided when num_class_embeds > 0") # maybe cast it to float16 class_labels = class_labels.cast(self.dtype) @@ -861,20 +852,15 @@ def forward( # 3. 
down - is_controlnet = (mid_block_additional_residual is not None and - down_block_additional_residuals is not None) - is_adapter = (mid_block_additional_residual is None and - down_block_additional_residuals is not None) + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None - down_block_res_samples = (sample, ) + down_block_res_samples = (sample,) for downsample_block in self.down_blocks: - if (hasattr(downsample_block, "has_cross_attention") and - downsample_block.has_cross_attention): + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: additional_kwargs = {} if is_adapter and len(down_block_additional_residuals) > 0: - additional_kwargs[ - "additional_residuals"] = down_block_additional_residuals.pop( - 0) + additional_kwargs["additional_residuals"] = down_block_additional_residuals.pop(0) sample, res_samples = downsample_block( hidden_states=sample, @@ -882,10 +868,10 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, - **additional_kwargs, ) + **additional_kwargs, + ) else: - sample, res_samples = downsample_block( - hidden_states=sample, temb=emb) + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) if is_adapter and len(down_block_additional_residuals) > 0: sample += down_block_additional_residuals.pop(0) @@ -896,10 +882,10 @@ def forward( new_down_block_res_samples = () for down_block_res_sample, down_block_additional_residual in zip( - down_block_res_samples, down_block_additional_residuals): - down_block_res_sample = ( - down_block_res_sample + down_block_additional_residual) - new_down_block_res_samples += (down_block_res_sample, ) + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) down_block_res_samples = new_down_block_res_samples # 4. 
mid @@ -909,7 +895,8 @@ def forward( emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, ) + cross_attention_kwargs=cross_attention_kwargs, + ) if is_controlnet: sample = sample + mid_block_additional_residual @@ -918,17 +905,15 @@ def forward( for i, upsample_block in enumerate(self.up_blocks): is_final_block = i == len(self.up_blocks) - 1 - res_samples = down_block_res_samples[-len(upsample_block.resnets):] - down_block_res_samples = down_block_res_samples[:-len( - upsample_block.resnets)] + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] # if we have not reached the final block and need to forward the # upsample size, we do it here if not is_final_block and forward_upsample_size: upsample_size = down_block_res_samples[-1].shape[2:] - if (hasattr(upsample_block, "has_cross_attention") and - upsample_block.has_cross_attention): + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: sample = upsample_block( hidden_states=sample, temb=emb, @@ -936,13 +921,15 @@ def forward( encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, upsample_size=upsample_size, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) else: sample = upsample_block( hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, - upsample_size=upsample_size, ) + upsample_size=upsample_size, + ) # 6. post-process if self.conv_norm_out: sample = self.conv_norm_out(sample) @@ -950,72 +937,60 @@ def forward( sample = self.conv_out(sample) if not return_dict: - return (sample, ) + return (sample,) return UNet2DConditionOutput(sample=sample) class LinearMultiDim(nn.Linear): - def __init__(self, - in_features, - out_features=None, - second_dim=4, - *args, - **kwargs): - in_features = ([in_features, second_dim, 1] - if isinstance(in_features, int) else list(in_features)) + def __init__(self, in_features, out_features=None, second_dim=4, *args, **kwargs): + in_features = [in_features, second_dim, 1] if isinstance(in_features, int) else list(in_features) if out_features is None: out_features = in_features - out_features = ([out_features, second_dim, 1] if - isinstance(out_features, int) else list(out_features)) + out_features = [out_features, second_dim, 1] if isinstance(out_features, int) else list(out_features) self.in_features_multidim = in_features self.out_features_multidim = out_features self.n_dim = len(self.in_features_multidim) - super().__init__( - np.array(in_features).prod(), np.array(out_features).prod()) + super().__init__(np.array(in_features).prod(), np.array(out_features).prod()) self.in_features = self.weight.shape[0] def forward(self, input_tensor, *args, **kwargs): shape = input_tensor.shape - input_tensor = input_tensor.reshape( - [*shape[0:-self.n_dim], self.in_features]) + input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_features]) output_tensor = super().forward(input_tensor) - output_tensor = output_tensor.reshape( - [*shape[0:-self.n_dim], *self.out_features_multidim]) + output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_features_multidim]) return output_tensor class ResnetBlockFlat(nn.Layer): def __init__( - self, - *, - in_channels, - out_channels=None, - dropout: float=0.0, - temb_channels: int=512, - groups: int=32, - groups_out=None, - pre_norm: bool=True, - eps: float=1e-6, - 
time_embedding_norm: str="default", - use_in_shortcut=None, - second_dim: int=4, - pre_temb_non_linearity: bool=False, - **kwargs, ): + self, + *, + in_channels, + out_channels=None, + dropout: float = 0.0, + temb_channels: int = 512, + groups: int = 32, + groups_out=None, + pre_norm: bool = True, + eps: float = 1e-6, + time_embedding_norm: str = "default", + use_in_shortcut=None, + second_dim: int = 4, + pre_temb_non_linearity: bool = False, + **kwargs, + ): super().__init__() self.pre_temb_non_linearity = pre_temb_non_linearity self.pre_norm = pre_norm self.pre_norm = True - in_channels = ([in_channels, second_dim, 1] - if isinstance(in_channels, int) else list(in_channels)) + in_channels = [in_channels, second_dim, 1] if isinstance(in_channels, int) else list(in_channels) self.in_channels_prod = np.array(in_channels).prod() self.channels_multidim = in_channels if out_channels is not None: - out_channels = ([out_channels, second_dim, 1] - if isinstance(out_channels, int) else - list(out_channels)) + out_channels = [out_channels, second_dim, 1] if isinstance(out_channels, int) else list(out_channels) out_channels_prod = np.array(out_channels).prod() self.out_channels_multidim = out_channels else: @@ -1026,26 +1001,23 @@ def __init__( if groups_out is None: groups_out = groups - self.norm1 = nn.GroupNorm( - num_groups=groups, num_channels=self.in_channels_prod, epsilon=eps) - self.conv1 = nn.Conv2D( - self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0) + self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=self.in_channels_prod, epsilon=eps) + self.conv1 = nn.Conv2D(self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0) if temb_channels is not None: self.time_emb_proj = nn.Linear(temb_channels, out_channels_prod) else: self.time_emb_proj = None - self.norm2 = nn.GroupNorm( - num_groups=groups_out, num_channels=out_channels_prod, epsilon=eps) + self.norm2 = nn.GroupNorm(num_groups=groups_out, num_channels=out_channels_prod, epsilon=eps) self.dropout = nn.Dropout(dropout) - self.conv2 = nn.Conv2D( - out_channels_prod, out_channels_prod, kernel_size=1, padding=0) + self.conv2 = nn.Conv2D(out_channels_prod, out_channels_prod, kernel_size=1, padding=0) self.nonlinearity = nn.Silu() - self.use_in_shortcut = (self.in_channels_prod != out_channels_prod - if use_in_shortcut is None else use_in_shortcut) + self.use_in_shortcut = ( + self.in_channels_prod != out_channels_prod if use_in_shortcut is None else use_in_shortcut + ) self.conv_shortcut = None if self.use_in_shortcut: @@ -1054,14 +1026,14 @@ def __init__( out_channels_prod, kernel_size=1, stride=1, - padding=0, ) + padding=0, + ) self.n_dim = len(self.channels_multidim) def forward(self, input_tensor, temb=None): shape = input_tensor.shape - input_tensor = input_tensor.reshape( - [*shape[0:-self.n_dim], self.in_channels_prod, 1, 1]) + input_tensor = input_tensor.reshape([*shape[0 : -self.n_dim], self.in_channels_prod, 1, 1]) input_tensor = input_tensor.reshape([-1, self.in_channels_prod, 1, 1]) hidden_states = input_tensor @@ -1072,8 +1044,7 @@ def forward(self, input_tensor, temb=None): if temb is not None and self.time_emb_proj is not None: if not self.pre_temb_non_linearity: - temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, - None] + temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None] else: temb = self.time_emb_proj(temb)[:, :, None, None] hidden_states = hidden_states + temb @@ -1089,9 +1060,8 @@ def forward(self, input_tensor, temb=None): output_tensor = input_tensor + 
hidden_states - output_tensor = output_tensor.reshape([*shape[0:-self.n_dim], -1]) - output_tensor = output_tensor.reshape( - [*shape[0:-self.n_dim], *self.out_channels_multidim]) + output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], -1]) + output_tensor = output_tensor.reshape([*shape[0 : -self.n_dim], *self.out_channels_multidim]) return output_tensor @@ -1099,21 +1069,22 @@ def forward(self, input_tensor, temb=None): # Copied from ppdiffusers.models.unet_2d_blocks.DownBlock2D with DownBlock2D->DownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim class DownBlockFlat(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_downsample: bool=True, - downsample_padding: int=1, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] @@ -1131,19 +1102,24 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + LinearMultiDim( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None @@ -1153,8 +1129,7 @@ def forward(self, hidden_states, temb=None): output_states = () for resnet in self.resnets: - if (self.training and self.gradient_checkpointing and - not hidden_states.stop_gradient): + if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient: def create_custom_forward(module): def custom_forward(*inputs): @@ -1162,18 +1137,17 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) - output_states += (hidden_states, ) + output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states @@ -1181,27 +1155,28 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnDownBlock2D with CrossAttnDownBlock2D->CrossAttnDownBlockFlat, ResnetBlock2D->ResnetBlockFlat, Downsample2D->LinearMultiDim class CrossAttnDownBlockFlat(nn.Layer): def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: 
float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - downsample_padding: int=1, - add_downsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1223,7 +1198,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -1235,7 +1212,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1244,32 +1223,38 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_downsample: - self.downsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, - use_conv=True, - out_channels=out_channels, - padding=downsample_padding, - name="op", ) - ]) + self.downsamplers = nn.LayerList( + [ + LinearMultiDim( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, - additional_residuals=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + additional_residuals=None, + ): # TODO(Patrick, William) - attention mask is not used output_states = () @@ -1285,22 +1270,22 @@ def custom_forward(*inputs): return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, 
).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample - output_states += (hidden_states, ) + output_states += (hidden_states,) if additional_residuals is not None: hidden_states += additional_residuals @@ -1309,7 +1294,7 @@ def custom_forward(*inputs): for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) - output_states += (hidden_states, ) + output_states += (hidden_states,) return hidden_states, output_states @@ -1317,27 +1302,27 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.UpBlock2D with UpBlock2D->UpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim class UpBlockFlat(nn.Layer): def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - output_scale_factor: float=1.0, - add_upsample: bool=True, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1352,31 +1337,25 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False - def forward(self, - hidden_states, - res_hidden_states_tuple, - temb=None, - upsample_size=None): + def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): for resnet in self.resnets: # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1) if self.training and self.gradient_checkpointing: @@ -1386,8 +1365,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) else: hidden_states = resnet(hidden_states, temb) @@ -1401,27 +1379,28 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.CrossAttnUpBlock2D with CrossAttnUpBlock2D->CrossAttnUpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim class CrossAttnUpBlockFlat(nn.Layer): def __init__( - self, 
- in_channels: int, - out_channels: int, - prev_output_channel: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - cross_attention_dim: int=1280, - output_scale_factor: float=1.0, - add_upsample: bool=True, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - only_cross_attention: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() resnets = [] attentions = [] @@ -1430,8 +1409,7 @@ def __init__( self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): - res_skip_channels = in_channels if ( - i == num_layers - 1) else out_channels + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( @@ -1446,7 +1424,9 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) if not dual_cross_attention: attentions.append( Transformer2DModel( @@ -1458,7 +1438,9 @@ def __init__( norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1467,36 +1449,35 @@ def __init__( in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) if add_upsample: - self.upsamplers = nn.LayerList([ - LinearMultiDim( - out_channels, use_conv=True, out_channels=out_channels) - ]) + self.upsamplers = nn.LayerList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) else: self.upsamplers = None self.gradient_checkpointing = False def forward( - self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, ): + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + cross_attention_kwargs=None, + upsample_size=None, + attention_mask=None, + ): # TODO(Patrick, William) - attention mask is not used for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = paddle.concat( - [hidden_states, res_hidden_states], axis=1) + hidden_states = 
paddle.concat([hidden_states, res_hidden_states], axis=1) if self.training and self.gradient_checkpointing: @@ -1509,20 +1490,20 @@ def custom_forward(*inputs): return custom_forward + hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb) hidden_states = recompute( - create_custom_forward(resnet), hidden_states, temb) - hidden_states = recompute( - create_custom_forward( - attn, return_dict=False), + create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, - cross_attention_kwargs, ) # [0] + cross_attention_kwargs, + ) # [0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -1534,29 +1515,29 @@ def custom_forward(*inputs): # Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DCrossAttn with UNetMidBlock2DCrossAttn->UNetMidBlockFlatCrossAttn, ResnetBlock2D->ResnetBlockFlat class UNetMidBlockFlatCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - dual_cross_attention: bool=False, - use_linear_projection: bool=False, - upcast_attention: bool=False, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + upcast_attention: bool = False, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet resnets = [ @@ -1571,7 +1552,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -1586,7 +1568,9 @@ def __init__( cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, - upcast_attention=upcast_attention, )) + upcast_attention=upcast_attention, + ) + ) else: attentions.append( DualTransformer2DModel( @@ -1595,7 +1579,9 @@ def __init__( in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, - norm_num_groups=resnet_groups, )) + norm_num_groups=resnet_groups, + ) + ) resnets.append( ResnetBlockFlat( in_channels=in_channels, @@ -1608,24 +1594,28 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + 
pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, ).sample + cross_attention_kwargs=cross_attention_kwargs, + ).sample hidden_states = resnet(hidden_states, temb) return hidden_states @@ -1634,30 +1624,30 @@ def forward( # Copied from ppdiffusers.models.unet_2d_blocks.UNetMidBlock2DSimpleCrossAttn with UNetMidBlock2DSimpleCrossAttn->UNetMidBlockFlatSimpleCrossAttn, ResnetBlock2D->ResnetBlockFlat class UNetMidBlockFlatSimpleCrossAttn(nn.Layer): def __init__( - self, - in_channels: int, - temb_channels: int, - dropout: float=0.0, - num_layers: int=1, - resnet_eps: float=1e-6, - resnet_time_scale_shift: str="default", - resnet_act_fn: str="swish", - resnet_groups: int=32, - resnet_pre_norm: bool=True, - attn_num_head_channels: int=1, - output_scale_factor: float=1.0, - cross_attention_dim: int=1280, - skip_time_act=False, - only_cross_attention=False, - cross_attention_norm=None, - resnet_pre_temb_non_linearity: bool=False, ): + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + skip_time_act=False, + only_cross_attention=False, + cross_attention_norm=None, + resnet_pre_temb_non_linearity: bool = False, + ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels - resnet_groups = (resnet_groups if resnet_groups is not None else - min(in_channels // 4, 32)) + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) self.num_heads = in_channels // self.attn_num_head_channels @@ -1674,7 +1664,8 @@ def __init__( non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, ) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) ] attentions = [] @@ -1696,7 +1687,9 @@ def __init__( upcast_softmax=True, only_cross_attention=only_cross_attention, cross_attention_norm=cross_attention_norm, - processor=processor, )) + processor=processor, + ) + ) resnets.append( ResnetBlockFlat( in_channels=in_channels, @@ -1710,20 +1703,22 @@ def __init__( output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, skip_time_act=skip_time_act, - pre_temb_non_linearity=resnet_pre_temb_non_linearity, )) + pre_temb_non_linearity=resnet_pre_temb_non_linearity, + ) + ) self.attentions = nn.LayerList(attentions) self.resnets = nn.LayerList(resnets) def forward( - self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, ): - cross_attention_kwargs = (cross_attention_kwargs if - cross_attention_kwargs is not None else {}) + self, + hidden_states, + temb=None, + 
encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): # attn @@ -1731,7 +1726,8 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, - **cross_attention_kwargs, ) + **cross_attention_kwargs, + ) # resnet hidden_states = resnet(hidden_states, temb) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py index c09df819c2b79..43a40201892a1 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py @@ -18,21 +18,27 @@ import paddle import PIL.Image -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging from ..pipeline_utils import DiffusionPipeline from .modeling_text_unet import UNetFlatConditionModel -from .pipeline_versatile_diffusion_dual_guided import \ - VersatileDiffusionDualGuidedPipeline -from .pipeline_versatile_diffusion_image_variation import \ - VersatileDiffusionImageVariationPipeline -from .pipeline_versatile_diffusion_text_to_image import \ - VersatileDiffusionTextToImagePipeline +from .pipeline_versatile_diffusion_dual_guided import ( + VersatileDiffusionDualGuidedPipeline, +) +from .pipeline_versatile_diffusion_image_variation import ( + VersatileDiffusionImageVariationPipeline, +) +from .pipeline_versatile_diffusion_text_to_image import ( + VersatileDiffusionTextToImagePipeline, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -77,15 +83,16 @@ class VersatileDiffusionPipeline(DiffusionPipeline): scheduler: KarrasDiffusionSchedulers def __init__( - self, - tokenizer: CLIPTokenizer, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModelWithProjection, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNet2DConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, ): + self, + tokenizer: CLIPTokenizer, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModelWithProjection, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( @@ -96,27 +103,28 @@ def __init__( image_unet=image_unet, text_unet=text_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) @paddle.no_grad() def image_variation( - self, - image: Union[paddle.Tensor, PIL.Image.Image], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, 
List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + image: Union[paddle.Tensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -194,13 +202,8 @@ def image_variation( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - expected_components = inspect.signature( - VersatileDiffusionImageVariationPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(VersatileDiffusionImageVariationPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} return VersatileDiffusionImageVariationPipeline(**components)( image=image, height=height, @@ -215,26 +218,27 @@ def image_variation( output_type=output_type, return_dict=return_dict, callback=callback, - callback_steps=callback_steps, ) + callback_steps=callback_steps, + ) @paddle.no_grad() def text_to_image( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -303,13 +307,8 @@ def text_to_image( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. 
""" - expected_components = inspect.signature( - VersatileDiffusionTextToImagePipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(VersatileDiffusionTextToImagePipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = VersatileDiffusionTextToImagePipeline(**components) output = temp_pipeline( prompt=prompt, @@ -325,7 +324,8 @@ def text_to_image( output_type=output_type, return_dict=return_dict, callback=callback, - callback_steps=callback_steps, ) + callback_steps=callback_steps, + ) # swap the attention blocks back to the original state temp_pipeline._swap_unet_attention_blocks() @@ -333,23 +333,23 @@ def text_to_image( @paddle.no_grad() def dual_guided( - self, - prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], - image: Union[str, List[str]], - text_to_image_strength: float=0.5, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ): + self, + prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], + image: Union[str, List[str]], + text_to_image_strength: float = 0.5, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ): r""" Function invoked when calling the pipeline for generation. @@ -431,13 +431,8 @@ def dual_guided( returning a tuple, the first element is a list with the generated images. 
""" - expected_components = inspect.signature( - VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys() - components = { - name: component - for name, component in self.components.items() - if name in expected_components - } + expected_components = inspect.signature(VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components) output = temp_pipeline( prompt=prompt, @@ -454,7 +449,8 @@ def dual_guided( output_type=output_type, return_dict=return_dict, callback=callback, - callback_steps=callback_steps, ) + callback_steps=callback_steps, + ) temp_pipeline._revert_dual_attention() return output diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index a47088e2f9411..faf4c4f7232ed 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -19,12 +19,19 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer, - CLIPVisionModelWithProjection) - -from ...models import (AutoencoderKL, DualTransformer2DModel, - Transformer2DModel, UNet2DConditionModel) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...models import ( + AutoencoderKL, + DualTransformer2DModel, + Transformer2DModel, + UNet2DConditionModel, +) from ...schedulers import KarrasDiffusionSchedulers from ...utils import logging, randn_tensor from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput @@ -74,15 +81,16 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): _optional_components = ["text_unet"] def __init__( - self, - tokenizer: CLIPTokenizer, - image_feature_extractor: CLIPImageProcessor, - text_encoder: CLIPTextModelWithProjection, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNetFlatConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, ): + self, + tokenizer: CLIPTokenizer, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModelWithProjection, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNetFlatConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( tokenizer=tokenizer, @@ -92,12 +100,13 @@ def __init__( image_unet=image_unet, text_unet=text_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if self.text_unet is not None and ( - "dual_cross_attention" not in self.image_unet.config or - not self.image_unet.config.dual_cross_attention): + "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention + ): # if loading from a universal checkpoint rather than a saved dual-guided pipeline self._convert_to_dual_attention() @@ -114,10 +123,8 @@ def _convert_to_dual_attention(self): parent_name, 
index = name.rsplit(".", 1) index = int(index) - image_transformer = self.image_unet.get_sublayer(parent_name)[ - index] - text_transformer = self.text_unet.get_sublayer(parent_name)[ - index] + image_transformer = self.image_unet.get_sublayer(parent_name)[index] + text_transformer = self.text_unet.get_sublayer(parent_name)[index] config = image_transformer.config dual_transformer = DualTransformer2DModel( @@ -132,12 +139,12 @@ def _convert_to_dual_attention(self): sample_size=config.sample_size, num_vector_embeds=config.num_vector_embeds, activation_fn=config.activation_fn, - num_embeds_ada_norm=config.num_embeds_ada_norm, ) + num_embeds_ada_norm=config.num_embeds_ada_norm, + ) dual_transformer.transformers[0] = image_transformer dual_transformer.transformers[1] = text_transformer - self.image_unet.get_sublayer(parent_name)[ - index] = dual_transformer + self.image_unet.get_sublayer(parent_name)[index] = dual_transformer self.image_unet.register_to_config(dual_cross_attention=True) def _revert_dual_attention(self): @@ -149,12 +156,10 @@ def _revert_dual_attention(self): if isinstance(module, DualTransformer2DModel): parent_name, index = name.rsplit(".", 1) index = int(index) - self.image_unet.get_sublayer(parent_name)[ - index] = module.transformers[0] + self.image_unet.get_sublayer(parent_name)[index] = module.transformers[0] self.image_unet.register_to_config(dual_cross_attention=False) - def _encode_text_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_text_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): r""" Encodes the prompt into text encoder hidden states. @@ -168,11 +173,9 @@ def _encode_text_prompt(self, prompt, num_images_per_prompt, """ def normalize_embeddings(encoder_output): - embeds = paddle.matmul(encoder_output.last_hidden_state, - self.text_encoder.text_projection) + embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection) embeds_pooled = encoder_output.text_embeds - embeds = embeds / paddle.norm( - embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) + embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) return embeds batch_size = len(prompt) @@ -182,35 +185,35 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and 
self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = normalize_embeddings(prompt_embeds) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -221,37 +224,33 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + attention_mask=attention_mask, + ) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds - def _encode_image_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_image_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): r""" Encodes the prompt into vision encoder hidden states. 
@@ -265,8 +264,7 @@ def _encode_image_prompt(self, prompt, num_images_per_prompt, """ def normalize_embeddings(encoder_output): - embeds = self.image_encoder.vision_model.ln_post( - encoder_output.last_hidden_state) + embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state) embeds = paddle.matmul(embeds, self.image_encoder.vision_projection) embeds_pooled = embeds[:, 0:1] embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True) @@ -275,8 +273,7 @@ def normalize_embeddings(encoder_output): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings - image_input = self.image_feature_extractor( - images=prompt, return_tensors="pd") + image_input = self.image_feature_extractor(images=prompt, return_tensors="pd") pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype) image_embeddings = self.image_encoder(pixel_values) image_embeddings = normalize_embeddings(image_embeddings) @@ -284,32 +281,25 @@ def normalize_embeddings(encoder_output): # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size - uncond_images = self.image_feature_extractor( - images=uncond_images, return_tensors="pd") - pixel_values = uncond_images.pixel_values.cast( - self.image_encoder.dtype) + uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd") + pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype) negative_prompt_embeds = self.image_encoder(pixel_values) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and conditional embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings @@ -329,60 +319,51 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs def check_inputs(self, prompt, image, height, width, callback_steps): - if (not isinstance(prompt, str) and - not isinstance(prompt, PIL.Image.Image) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}" - ) - if (not isinstance(image, str) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): - raise ValueError( - f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}" - ) + if not isinstance(prompt, str) and not isinstance(prompt, PIL.Image.Image) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}") + if not isinstance(image, str) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list): + raise ValueError(f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = ( batch_size, num_channels_latents, height // self.vae_scale_factor, - width // self.vae_scale_factor, ) + width // self.vae_scale_factor, + ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" @@ -396,44 +377,39 @@ def prepare_latents( latents = latents * self.scheduler.init_noise_sigma return latents - def set_transformer_params(self, - mix_ratio: float=0.5, - condition_types: Tuple=("text", "image")): + def set_transformer_params(self, mix_ratio: float = 0.5, condition_types: Tuple = ("text", "image")): for name, module in self.image_unet.named_sublayers(include_self=True): if isinstance(module, DualTransformer2DModel): module.mix_ratio = mix_ratio for i, type in enumerate(condition_types): if type == "text": - module.condition_lengths[ - i] = self.text_encoder.config.max_position_embeddings - module.transformer_index_for_condition[ - i] = 1 # use the second (text) transformer + module.condition_lengths[i] = self.text_encoder.config.max_position_embeddings + module.transformer_index_for_condition[i] = 1 # use the second (text) transformer else: module.condition_lengths[i] = 257 - module.transformer_index_for_condition[ - i] = 0 # use the first (image) transformer + module.transformer_index_for_condition[i] = 0 # use the first (image) transformer @paddle.no_grad() def __call__( - self, - prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], - image: Union[str, List[str]], - text_to_image_strength: float=0.5, - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], + image: Union[str, List[str]], + text_to_image_strength: float = 0.5, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -532,12 +508,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompts - prompt_embeds = self._encode_text_prompt(prompt, num_images_per_prompt, - do_classifier_free_guidance) - image_embeddings = self._encode_image_prompt( - image, num_images_per_prompt, do_classifier_free_guidance) - dual_prompt_embeddings = paddle.concat( - [prompt_embeds, image_embeddings], axis=1) + prompt_embeds = self._encode_text_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance) + image_embeddings = self._encode_image_prompt(image, num_images_per_prompt, do_classifier_free_guidance) + dual_prompt_embeddings = paddle.concat([prompt_embeds, image_embeddings], axis=1) prompt_types = ("text", "image") # 4. Prepare timesteps @@ -553,7 +526,8 @@ def __call__( width, dual_prompt_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -564,26 +538,19 @@ def __call__( # 8. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.image_unet( - latent_model_input, - t, - encoder_hidden_states=dual_prompt_embeddings).sample + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=dual_prompt_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -597,6 +564,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py index 668f748dfa42a..fc9d645fc7991 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -19,8 +19,7 @@ import numpy as np import paddle import PIL -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -57,27 +56,30 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): scheduler: KarrasDiffusionSchedulers def __init__( - self, - image_feature_extractor: CLIPImageProcessor, - image_encoder: CLIPVisionModelWithProjection, - image_unet: UNet2DConditionModel, - vae: AutoencoderKL, - 
scheduler: KarrasDiffusionSchedulers, ): + self, + image_feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( image_feature_extractor=image_feature_extractor, image_encoder=image_encoder, image_unet=image_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) def _encode_image_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + ): r""" Encodes the prompt into text encoder hidden states. @@ -94,8 +96,7 @@ def _encode_image_prompt( """ def normalize_embeddings(encoder_output): - embeds = self.image_encoder.vision_model.ln_post( - encoder_output.last_hidden_state) + embeds = self.image_encoder.vision_model.ln_post(encoder_output.last_hidden_state) embeds = paddle.matmul(embeds, self.image_encoder.vision_projection) embeds_pooled = embeds[:, 0:1] embeds = embeds / paddle.norm(embeds_pooled, axis=-1, keepdim=True) @@ -107,8 +108,7 @@ def normalize_embeddings(encoder_output): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings - image_input = self.image_feature_extractor( - images=prompt, return_tensors="pd") + image_input = self.image_feature_extractor(images=prompt, return_tensors="pd") pixel_values = image_input.pixel_values.cast(self.image_encoder.dtype) image_embeddings = self.image_encoder(pixel_values) image_embeddings = normalize_embeddings(image_embeddings) @@ -116,8 +116,7 @@ def normalize_embeddings(encoder_output): # duplicate image embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = image_embeddings.shape image_embeddings = image_embeddings.tile([1, num_images_per_prompt, 1]) - image_embeddings = image_embeddings.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + image_embeddings = image_embeddings.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -127,37 +126,33 @@ def normalize_embeddings(encoder_output): elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" {type(prompt)}." + ) elif isinstance(negative_prompt, PIL.Image.Image): uncond_images = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." 
+ ) else: uncond_images = negative_prompt - uncond_images = self.image_feature_extractor( - images=uncond_images, return_tensors="pd") - pixel_values = uncond_images.pixel_values.cast( - self.image_encoder.dtype) + uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pd") + pixel_values = uncond_images.pixel_values.cast(self.image_encoder.dtype) negative_prompt_embeds = self.image_encoder(pixel_values) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and conditional embeddings into a single batch # to avoid doing two forward passes - image_embeddings = paddle.concat( - [negative_prompt_embeds, image_embeddings]) + image_embeddings = paddle.concat([negative_prompt_embeds, image_embeddings]) return image_embeddings @@ -177,50 +172,51 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs def check_inputs(self, image, height, width, callback_steps): - if (not isinstance(image, paddle.Tensor) and - not isinstance(image, PIL.Image.Image) and - not isinstance(image, list)): + if ( + not isinstance(image, paddle.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): raise ValueError( "`image` has to be of type `paddle.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" - f" {type(image)}") + f" {type(image)}" + ) if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." - ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." 
+ ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -242,23 +238,23 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], paddle.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -352,8 +348,8 @@ def __call__( # 3. Encode input prompt image_embeddings = self._encode_image_prompt( - image, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt) + image, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -368,7 +364,8 @@ def __call__( width, image_embeddings.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -376,25 +373,19 @@ def __call__( # 7. 
Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.image_unet( - latent_model_input, t, - encoder_hidden_states=image_embeddings).sample + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -408,6 +399,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py index 1524df9f993ed..0d4999c94b24c 100644 --- a/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py +++ b/ppdiffusers/ppdiffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -17,8 +17,11 @@ from typing import Callable, List, Optional, Union import paddle -from paddlenlp.transformers import (CLIPImageProcessor, - CLIPTextModelWithProjection, CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, +) from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers @@ -67,13 +70,14 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): _optional_components = ["text_unet"] def __init__( - self, - tokenizer: CLIPTokenizer, - text_encoder: CLIPTextModelWithProjection, - image_unet: UNet2DConditionModel, - text_unet: UNetFlatConditionModel, - vae: AutoencoderKL, - scheduler: KarrasDiffusionSchedulers, ): + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNetFlatConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): super().__init__() self.register_modules( tokenizer=tokenizer, @@ -81,8 +85,9 @@ def __init__( image_unet=image_unet, text_unet=text_unet, vae=vae, - scheduler=scheduler, ) - self.vae_scale_factor = 2**(len(self.vae.config.block_out_channels) - 1) + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if self.text_unet is not None: self._swap_unet_attention_blocks() @@ -97,19 +102,22 @@ def _swap_unet_attention_blocks(self): index = int(index) ( self.image_unet.get_sublayer(parent_name)[index], - 
self.text_unet.get_sublayer(parent_name)[index], ) = ( - self.text_unet.get_sublayer(parent_name)[index], - self.image_unet.get_sublayer(parent_name)[index], ) + self.text_unet.get_sublayer(parent_name)[index], + ) = ( + self.text_unet.get_sublayer(parent_name)[index], + self.image_unet.get_sublayer(parent_name)[index], + ) def remove_unused_weights(self): self.register_modules(text_unet=None) def _encode_prompt( - self, - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, ): + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + ): r""" Encodes the prompt into text encoder hidden states. @@ -126,11 +134,9 @@ def _encode_prompt( """ def normalize_embeddings(encoder_output): - embeds = paddle.matmul(encoder_output.last_hidden_state, - self.text_encoder.text_projection) + embeds = paddle.matmul(encoder_output.last_hidden_state, self.text_encoder.text_projection) embeds_pooled = encoder_output.text_embeds - embeds = embeds / paddle.norm( - embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) + embeds = embeds / paddle.norm(embeds_pooled.unsqueeze(1), axis=-1, keepdim=True) return embeds batch_size = len(prompt) if isinstance(prompt, list) else 1 @@ -140,35 +146,35 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer( - prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids - if untruncated_ids.shape[-1] >= text_input_ids.shape[ - -1] and not paddle.equal_all(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1:-1]) + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = text_inputs.attention_mask else: attention_mask = None prompt_embeds = self.text_encoder( text_input_ids, - attention_mask=attention_mask, ) + attention_mask=attention_mask, + ) prompt_embeds = normalize_embeddings(prompt_embeds) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: @@ -178,14 +184,16 @@ def normalize_embeddings(encoder_output): elif type(prompt) is not type(negative_prompt): raise TypeError( f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}.") + f" 
{type(prompt)}." + ) elif isinstance(negative_prompt, str): uncond_tokens = [negative_prompt] elif batch_size != len(negative_prompt): raise ValueError( f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`.") + " the batch size of `prompt`." + ) else: uncond_tokens = negative_prompt @@ -195,32 +203,29 @@ def normalize_embeddings(encoder_output): padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) - if (hasattr(self.text_encoder.config, "use_attention_mask") and - self.text_encoder.config.use_attention_mask): + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: attention_mask = uncond_input.attention_mask else: attention_mask = None negative_prompt_embeds = self.text_encoder( uncond_input.input_ids, - attention_mask=attention_mask, ) - negative_prompt_embeds = normalize_embeddings( - negative_prompt_embeds) + attention_mask=attention_mask, + ) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @@ -240,54 +245,50 @@ def prepare_extra_step_kwargs(self, generator, eta): # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = "eta" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = "generator" in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: extra_step_kwargs["generator"] = generator return extra_step_kwargs # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs def check_inputs( - self, - prompt, - height, - width, - callback_steps, - negative_prompt=None, - prompt_embeds=None, - negative_prompt_embeds=None, ): + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f"`height` and `width` have to be divisible by 8 but are {height} and {width}." 
- ) + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) if prompt is not None and prompt_embeds is not None: raise ValueError( f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two.") + " only forward one of the two." + ) elif prompt is None and prompt_embeds is None: raise ValueError( "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." ) - elif prompt is not None and (not isinstance(prompt, str) and - not isinstance(prompt, list)): - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") if negative_prompt is not None and negative_prompt_embeds is not None: raise ValueError( @@ -300,18 +301,20 @@ def check_inputs( raise ValueError( "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}.") + f" {negative_prompt_embeds.shape}." + ) # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents( - self, - batch_size, - num_channels_latents, - height, - width, - dtype, - generator, - latents=None, ): + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + generator, + latents=None, + ): shape = [ batch_size, num_channels_latents, @@ -333,23 +336,23 @@ def prepare_latents( @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - height: Optional[int]=None, - width: Optional[int]=None, - num_inference_steps: int=50, - guidance_scale: float=7.5, - negative_prompt: Optional[Union[str, List[str]]]=None, - num_images_per_prompt: Optional[int]=1, - eta: float=0.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, - **kwargs, ): + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + **kwargs, + ): r""" Function invoked when calling the pipeline for generation. @@ -434,9 +437,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt - prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt) + prompt_embeds = self._encode_prompt( + prompt, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -451,7 +454,8 @@ def __call__( width, prompt_embeds.dtype, generator, - latents, ) + latents, + ) # 6. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) @@ -459,25 +463,19 @@ def __call__( # 7. Denoising loop for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance - latent_model_input = (paddle.concat([latents] * 2) - if do_classifier_free_guidance else latents) - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = paddle.concat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.image_unet( - latent_model_input, t, - encoder_hidden_states=prompt_embeds).sample + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample # perform guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * ( - noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, - **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -491,6 +489,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py index 4a1b00a7eb0fa..f7426c40427c0 100644 --- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/__init__.py @@ -17,5 +17,7 @@ from ...utils import is_paddle_available, is_paddlenlp_available if is_paddle_available() and is_paddlenlp_available(): - from .pipeline_vq_diffusion import (LearnedClassifierFreeSamplingEmbeddings, - VQDiffusionPipeline) + from .pipeline_vq_diffusion import ( + LearnedClassifierFreeSamplingEmbeddings, + VQDiffusionPipeline, + ) diff --git a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py index f8d1fc09518db..e97be223237f9 100644 --- a/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py +++ b/ppdiffusers/ppdiffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py @@ -42,23 +42,23 @@ class LearnedClassifierFreeSamplingEmbeddings(ModelMixin, ConfigMixin): @register_to_config def __init__( - self, - learnable: bool, - hidden_size: Optional[int]=None, - length: Optional[int]=None, ): + self, + learnable: bool, + hidden_size: Optional[int] = None, + length: Optional[int] = None, + ): super().__init__() self.learnable = learnable if self.learnable: - assert (hidden_size is not None - ), "learnable=True requires `hidden_size` to be set" + 
assert hidden_size is not None, "learnable=True requires `hidden_size` to be set" assert length is not None, "learnable=True requires `length` to be set" embeddings = paddle.zeros([length, hidden_size]) self.embeddings = self.create_parameter( - embeddings.shape, - default_initializer=nn.initializer.Assign(embeddings)) + embeddings.shape, default_initializer=nn.initializer.Assign(embeddings) + ) else: self.embeddings = None @@ -95,13 +95,13 @@ class VQDiffusionPipeline(DiffusionPipeline): scheduler: VQDiffusionScheduler def __init__( - self, - vqvae: VQModel, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - transformer: Transformer2DModel, - scheduler: VQDiffusionScheduler, - learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings, + self, + vqvae: VQModel, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + transformer: Transformer2DModel, + scheduler: VQDiffusionScheduler, + learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings, ): super().__init__() @@ -114,8 +114,7 @@ def __init__( learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, ) - def _encode_prompt(self, prompt, num_images_per_prompt, - do_classifier_free_guidance): + def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): batch_size = len(prompt) if isinstance(prompt, list) else 1 # get prompt text embeddings @@ -123,16 +122,17 @@ def _encode_prompt(self, prompt, num_images_per_prompt, prompt, padding="max_length", max_length=self.tokenizer.model_max_length, - return_tensors="pd", ) + return_tensors="pd", + ) text_input_ids = text_inputs.input_ids if text_input_ids.shape[-1] > self.tokenizer.model_max_length: - removed_text = self.tokenizer.batch_decode( - text_input_ids[:, self.tokenizer.model_max_length:]) + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) logger.warning( "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}") - text_input_ids = text_input_ids[:, :self.tokenizer.model_max_length] + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] prompt_embeds = self.text_encoder(text_input_ids)[0] # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion. @@ -141,21 +141,17 @@ def _encode_prompt(self, prompt, num_images_per_prompt, # # CLIP normalizing the pooled output. 
# https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053 - prompt_embeds = prompt_embeds / prompt_embeds.norm( - axis=-1, keepdim=True) + prompt_embeds = prompt_embeds / prompt_embeds.norm(axis=-1, keepdim=True) # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) - prompt_embeds = prompt_embeds.reshape( - [bs_embed * num_images_per_prompt, seq_len, -1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) if do_classifier_free_guidance: if self.learned_classifier_free_sampling_embeddings.learnable: - negative_prompt_embeds = ( - self.learned_classifier_free_sampling_embeddings.embeddings) - negative_prompt_embeds = negative_prompt_embeds.unsqueeze( - 0).tile([batch_size, 1, 1]) + negative_prompt_embeds = self.learned_classifier_free_sampling_embeddings.embeddings + negative_prompt_embeds = negative_prompt_embeds.unsqueeze(0).tile([batch_size, 1, 1]) else: uncond_tokens = [""] * batch_size @@ -165,45 +161,39 @@ def _encode_prompt(self, prompt, num_images_per_prompt, padding="max_length", max_length=max_length, truncation=True, - return_tensors="pd", ) - negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids)[0] + return_tensors="pd", + ) + negative_prompt_embeds = self.text_encoder(uncond_input.input_ids)[0] # See comment for normalizing text embeddings - negative_prompt_embeds = (negative_prompt_embeds / - negative_prompt_embeds.norm( - axis=-1, keepdim=True)) + negative_prompt_embeds = negative_prompt_embeds / negative_prompt_embeds.norm(axis=-1, keepdim=True) # duplicate unconditional embeddings for each generation per prompt, using mps friendly method seq_len = negative_prompt_embeds.shape[1] - negative_prompt_embeds = negative_prompt_embeds.tile( - [1, num_images_per_prompt, 1]) - negative_prompt_embeds = negative_prompt_embeds.reshape( - [batch_size * num_images_per_prompt, seq_len, -1]) + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) # For classifier free guidance, we need to do two forward passes. 
# Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - prompt_embeds = paddle.concat( - [negative_prompt_embeds, prompt_embeds]) + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) return prompt_embeds @paddle.no_grad() def __call__( - self, - prompt: Union[str, List[str]], - num_inference_steps: int=100, - guidance_scale: float=5.0, - truncation_rate: float=1.0, - num_images_per_prompt: int=1, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - latents: Optional[paddle.Tensor]=None, - output_type: Optional[str]="pil", - return_dict: bool=True, - callback: Optional[Callable[[int, int, paddle.Tensor], None]]=None, - callback_steps: Optional[int]=1, ) -> Union[ImagePipelineOutput, - Tuple]: + self, + prompt: Union[str, List[str]], + num_inference_steps: int = 100, + guidance_scale: float = 5.0, + truncation_rate: float = 1.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: Optional[int] = 1, + ) -> Union[ImagePipelineOutput, Tuple]: """ Function invoked when calling the pipeline for generation. @@ -252,23 +242,21 @@ def __call__( elif isinstance(prompt, list): batch_size = len(prompt) else: - raise ValueError( - f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" - ) + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") batch_size = batch_size * num_images_per_prompt do_classifier_free_guidance = guidance_scale > 1.0 - prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, - do_classifier_free_guidance) + prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance) if (callback_steps is None) or ( - callback_steps is not None and - (not isinstance(callback_steps, int) or callback_steps <= 0)): + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): raise ValueError( f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}.") + f" {type(callback_steps)}." + ) # get the initial completely masked latents unless the user supplied it @@ -278,14 +266,12 @@ def __call__( latents = paddle.full(latents_shape, mask_class, dtype="int64") else: if latents.shape != latents_shape: - raise ValueError( - f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}" - ) - if (latents < 0).any() or ( - latents >= self.transformer.num_vector_embeds).any(): + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any(): raise ValueError( "Unexpected latents value(s). All latents be valid embedding indices i.e. in the range 0," - f" {self.transformer.num_vector_embeds - 1} (inclusive).") + f" {self.transformer.num_vector_embeds - 1} (inclusive)." 
+ ) # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -296,20 +282,15 @@ def __call__( for i, t in enumerate(self.progress_bar(timesteps_tensor)): # expand the sample if we are doing classifier free guidance - latent_model_input = (paddle.concat([sample] * 2) - if do_classifier_free_guidance else sample) + latent_model_input = paddle.concat([sample] * 2) if do_classifier_free_guidance else sample # predict the un-noised image # model_output == `log_p_x_0` - model_output = self.transformer( - latent_model_input, - encoder_hidden_states=prompt_embeds, - timestep=t).sample + model_output = self.transformer(latent_model_input, encoder_hidden_states=prompt_embeds, timestep=t).sample if do_classifier_free_guidance: model_output_uncond, model_output_text = model_output.chunk(2) - model_output = model_output_uncond + guidance_scale * ( - model_output_text - model_output_uncond) + model_output = model_output_uncond + guidance_scale * (model_output_text - model_output_uncond) model_output -= logsumexp(model_output, axis=1, keepdim=True) model_output = self.truncate(model_output, truncation_rate) @@ -318,9 +299,7 @@ def __call__( model_output = model_output.clip(-70) # compute the previous noisy sample x_t -> x_t-1 - sample = self.scheduler.step( - model_output, timestep=t, sample=sample, - generator=generator).prev_sample + sample = self.scheduler.step(model_output, timestep=t, sample=sample, generator=generator).prev_sample # call the callback, if provided if callback is not None and i % callback_steps == 0: @@ -331,9 +310,9 @@ def __call__( batch_size, self.transformer.height, self.transformer.width, - embedding_channels, ) - embeddings = self.vqvae.quantize.get_codebook_entry( - sample, shape=embeddings_shape) + embedding_channels, + ) + embeddings = self.vqvae.quantize.get_codebook_entry(sample, shape=embeddings_shape) image = self.vqvae.decode(embeddings, force_not_quantize=True).sample image = (image / 2 + 0.5).clip(0, 1) @@ -343,34 +322,29 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ) + return (image,) return ImagePipelineOutput(images=image) - def truncate(self, log_p_x_0: paddle.Tensor, - truncation_rate: float) -> paddle.Tensor: + def truncate(self, log_p_x_0: paddle.Tensor, truncation_rate: float) -> paddle.Tensor: """ Truncates log_p_x_0 such that for each column vector, the total cumulative probability is `truncation_rate` The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to zero. 
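A toy illustration of that rule, with invented probabilities and a simplified 1-D shape (the real implementation below operates on a batch of column vectors in log space): a class is kept while the cumulative probability of the classes ranked above it is still below truncation_rate, and the top class is always kept.

    import paddle

    p = paddle.to_tensor([0.5, 0.3, 0.15, 0.05])  # toy categorical distribution
    truncation_rate = 0.9

    sorted_p, indices = paddle.topk(p, k=4)
    keep = (sorted_p.cumsum(0) < truncation_rate).cast("int64")           # [1, 1, 0, 0]
    keep = paddle.concat([paddle.ones([1], dtype="int64"), keep[:-1]])    # shift: always keep the largest
    # keep is now [1, 1, 1, 0]; the 0.05 class would be set to -inf in log space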
""" - sorted_log_p_x_0, indices = paddle.topk( - log_p_x_0, k=log_p_x_0.shape[1], axis=1) + sorted_log_p_x_0, indices = paddle.topk(log_p_x_0, k=log_p_x_0.shape[1], axis=1) sorted_p_x_0 = paddle.exp(sorted_log_p_x_0) - keep_mask = ( - sorted_p_x_0.cumsum(axis=1) < truncation_rate).cast("int64") + keep_mask = (sorted_p_x_0.cumsum(axis=1) < truncation_rate).cast("int64") # Ensure that at least the largest probability is not zeroed out all_true = paddle.full_like(keep_mask[:, 0:1, :], 1) keep_mask = paddle.concat((all_true, keep_mask), axis=1) keep_mask = keep_mask[:, :-1, :] - keep_mask = paddle.take_along_axis( - keep_mask, indices.argsort(1), - axis=1).cast("bool") # keep_mask.gather(indices.argsort(1), axis=1) + keep_mask = paddle.take_along_axis(keep_mask, indices.argsort(1), axis=1).cast( + "bool" + ) # keep_mask.gather(indices.argsort(1), axis=1) rv = log_p_x_0.clone() # rv[~keep_mask] = -INF # -inf = log(0) - rv = paddle.where( - keep_mask, rv, paddle.to_tensor( - -INF, dtype="float32")) + rv = paddle.where(keep_mask, rv, paddle.to_tensor(-INF, dtype="float32")) return rv diff --git a/ppdiffusers/ppdiffusers/schedulers/__init__.py b/ppdiffusers/ppdiffusers/schedulers/__init__.py index dd064c0187497..682e58fcc57df 100644 --- a/ppdiffusers/ppdiffusers/schedulers/__init__.py +++ b/ppdiffusers/ppdiffusers/schedulers/__init__.py @@ -13,8 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_scipy_available) +from ..utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_scipy_available, +) try: if not is_paddle_available(): @@ -22,8 +25,9 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_objects import * # noqa F403 else: - from .preconfig.preconfig_scheduling_euler_ancestral_discrete import \ - PreconfigEulerAncestralDiscreteScheduler + from .preconfig.preconfig_scheduling_euler_ancestral_discrete import ( + PreconfigEulerAncestralDiscreteScheduler, + ) from .scheduling_ddim import DDIMScheduler from .scheduling_ddim_inverse import DDIMInverseScheduler from .scheduling_ddpm import DDPMScheduler @@ -31,13 +35,11 @@ from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler from .scheduling_dpmsolver_unidiffuser import DPMSolverUniDiffuserScheduler - from .scheduling_euler_ancestral_discrete import \ - EulerAncestralDiscreteScheduler + from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler from .scheduling_euler_discrete import EulerDiscreteScheduler from .scheduling_heun_discrete import HeunDiscreteScheduler from .scheduling_ipndm import IPNDMScheduler - from .scheduling_k_dpm_2_ancestral_discrete import \ - KDPM2AncestralDiscreteScheduler + from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler from .scheduling_karras_ve import KarrasVeScheduler from .scheduling_pndm import PNDMScheduler @@ -55,6 +57,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_scipy_objects import * # noqa F403 else: - from .preconfig.preconfig_scheduling_lms_discrete import \ - PreconfigLMSDiscreteScheduler + from .preconfig.preconfig_scheduling_lms_discrete import ( + PreconfigLMSDiscreteScheduler, + ) from .scheduling_lms_discrete import LMSDiscreteScheduler diff --git 
a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py index 0af0ad582bd99..ecff93753b32d 100644 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py +++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/__init__.py @@ -14,8 +14,11 @@ # limitations under the License. # flake8: noqa -from ...utils import (OptionalDependencyNotAvailable, is_paddle_available, - is_scipy_available) +from ...utils import ( + OptionalDependencyNotAvailable, + is_paddle_available, + is_scipy_available, +) try: if not is_paddle_available(): @@ -23,13 +26,13 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_objects import * # noqa F403 else: - from .preconfig_scheduling_euler_ancestral_discrete import \ - PreconfigEulerAncestralDiscreteScheduler + from .preconfig_scheduling_euler_ancestral_discrete import ( + PreconfigEulerAncestralDiscreteScheduler, + ) try: if not (is_paddle_available() and is_scipy_available()): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ...utils.dummy_paddle_and_scipy_objects import * # noqa F403 else: - from .preconfig_scheduling_lms_discrete import \ - PreconfigLMSDiscreteScheduler + from .preconfig_scheduling_lms_discrete import PreconfigLMSDiscreteScheduler diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py index 53de9a57c4178..a925526d76b33 100644 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_euler_ancestral_discrete.py @@ -47,8 +47,7 @@ class PreconfigEulerAncestralDiscreteSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
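A minimal pure-Python sketch of how such a schedule is typically built from alpha_bar, using the max_beta=0.999 default shown in the signature above (a hedged illustration, not code from this hunk):

    import math

    def alpha_bar(t):
        # cumulative product of (1 - beta) as a function of normalized time t in [0, 1]
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    num_diffusion_timesteps, max_beta = 1000, 0.999
    betas = [
        min(1 - alpha_bar((i + 1) / num_diffusion_timesteps) / alpha_bar(i / num_diffusion_timesteps), max_beta)
        for i in range(num_diffusion_timesteps)
    ]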
@@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -108,38 +107,40 @@ class PreconfigEulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - preconfig: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + preconfig: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -148,18 +149,15 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.is_scale_input_called = False self.preconfig = preconfig self.step_index_offset = 0 - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - **kwargs) -> paddle.Tensor: + def scale_model_input( + self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs + ) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. 
@@ -178,7 +176,7 @@ def scale_model_input(self, if not self.preconfig: sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample else: if step_index > (len(self.latent_scales) - 1): @@ -196,13 +194,8 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = num_inference_steps self.step_index_offset = 0 - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -213,23 +206,21 @@ def set_timesteps(self, num_inference_steps: int): for step_index_i in range(len(self.timesteps)): sigma_from = self.sigmas[step_index_i] sigma_to = self.sigmas[step_index_i + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / - sigma_from**2)**0.5 - sigma_down = (sigma_to**2 - sigma_up**2)**0.5 + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 self.sigma_up.append(sigma_up) self.sigma_down.append(sigma_down) - self.latent_scales = 1 / ((self.sigmas**2 + 1)**0.5) + self.latent_scales = 1 / ((self.sigmas**2 + 1) ** 0.5) def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, - **kwargs, ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + **kwargs, + ) -> Union[PreconfigEulerAncestralDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -251,7 +242,8 @@ def step( if not self.is_scale_input_called: logger.warning( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." 
+ ) if kwargs.get("return_pred_original_sample") is not None: return_pred_original_sample = kwargs["return_pred_original_sample"] else: @@ -270,11 +262,9 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / ( - sigma**2 + 1)**0.5) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -283,38 +273,37 @@ def step( if not self.preconfig: sigma_from = self.sigmas[step_index] sigma_to = self.sigmas[step_index + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from - **2)**0.5 - sigma_down = (sigma_to**2 - sigma_up**2)**0.5 + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 else: sigma_up = self.sigma_up[step_index] sigma_down = self.sigma_down[step_index] # 2. Convert to an ODE derivative dt = sigma_down - sigma prev_sample = sample + derivative * dt - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) prev_sample = prev_sample + noise * sigma_up if not return_dict: if not return_pred_original_sample: - return (prev_sample, ) + return (prev_sample,) else: return (prev_sample, pred_original_sample) return PreconfigEulerAncestralDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + prev_sample=prev_sample, pred_original_sample=pred_original_sample + ) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples self.sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = self.sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py index dd6c73e2e7250..16f74fcb6860f 100644 --- a/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/preconfig/preconfig_scheduling_lms_discrete.py @@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -106,38 +106,40 @@ class PreconfigLMSDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: 
float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - preconfig=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + preconfig=True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -146,18 +148,15 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.derivatives = [] self.is_scale_input_called = False self.preconfig = preconfig - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - **kwargs) -> paddle.Tensor: + def scale_model_input( + self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor], **kwargs + ) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. 
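The ancestral update in the Euler scheduler's `step` above splits each move from `sigma_from` to `sigma_to` into a deterministic part (`sigma_down`) and re-injected noise (`sigma_up`), which is exactly what its `set_timesteps` precomputes when `preconfig` is enabled. A standalone numpy sketch of that update for the `epsilon` prediction type (names are illustrative):

import numpy as np

def euler_ancestral_step(sample, model_output, sigma_from, sigma_to, rng=None):
    if rng is None:
        rng = np.random.default_rng()

    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5

    pred_original_sample = sample - sigma_from * model_output          # "predicted x_0"
    derivative = (sample - pred_original_sample) / sigma_from          # ODE derivative (equals model_output here)

    prev_sample = sample + derivative * (sigma_down - sigma_from)      # deterministic step down to sigma_down
    return prev_sample + rng.standard_normal(sample.shape) * sigma_up  # ancestral noise back up to sigma_to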
@@ -175,7 +174,7 @@ def scale_model_input(self, self.is_scale_input_called = True if not self.preconfig: sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample else: return sample * self.latent_scales[step_index] @@ -195,16 +194,14 @@ def lms_derivative(tau): for k in range(order): if current_order == k: continue - prod *= (tau - self.sigmas[t - k]) / ( - self.sigmas[t - current_order] - self.sigmas[t - k]) + prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) return prod - integrated_coeff = integrate.quad( - lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] return integrated_coeff - def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4): + def set_timesteps(self, num_inference_steps: int, preconfig_order: int = 4): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. @@ -214,13 +211,8 @@ def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -230,24 +222,22 @@ def set_timesteps(self, num_inference_steps: int, preconfig_order: int=4): if self.preconfig: self.order = preconfig_order self.lms_coeffs = [] - self.latent_scales = [ - 1.0 / ((sigma**2 + 1)**0.5) for sigma in self.sigmas - ] + self.latent_scales = [1.0 / ((sigma**2 + 1) ** 0.5) for sigma in self.sigmas] for step_index in range(self.num_inference_steps): order = min(step_index + 1, preconfig_order) - self.lms_coeffs.append([ - self.get_lms_coefficient(order, step_index, curr_order) - for curr_order in range(order) - ]) + self.lms_coeffs.append( + [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] + ) def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - order: int=4, - return_dict: bool=True, - **kwargs, ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + order: int = 4, + return_dict: bool = True, + **kwargs, + ) -> Union[PreconfigLMSDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -272,7 +262,8 @@ def step( if not self.is_scale_input_called: warnings.warn( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." 
+ ) if kwargs.get("return_pred_original_sample") is not None: return_pred_original_sample = kwargs["return_pred_original_sample"] else: @@ -292,8 +283,7 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / ( - sigma**2 + 1)**0.5) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": pred_original_sample = model_output else: @@ -310,42 +300,37 @@ def step( if not self.preconfig: # 3. If not preconfiged, compute linear multistep coefficients. order = min(step_index + 1, order) - lms_coeffs = [ - self.get_lms_coefficient(order, step_index, curr_order) - for curr_order in range(order) - ] + lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] # 4. Compute previous sample based on the derivatives path prev_sample = sample + sum( - coeff * derivative - for coeff, derivative in zip(lms_coeffs, - reversed(self.derivatives))) + coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) + ) else: # 3. If preconfiged, direct compute previous sample based on the derivatives path prev_sample = sample + sum( coeff * derivative - for coeff, derivative in zip(self.lms_coeffs[step_index], - reversed(self.derivatives))) + for coeff, derivative in zip(self.lms_coeffs[step_index], reversed(self.derivatives)) + ) if not return_dict: if not return_pred_original_sample: - return (prev_sample, ) + return (prev_sample,) else: return (prev_sample, pred_original_sample) - return PreconfigLMSDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return PreconfigLMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py index b4929d761f687..9bb46c472ca10 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim.py @@ -48,8 +48,7 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -68,7 +67,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -131,38 +130,41 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - clip_sample: bool=True, - set_alpha_to_one: bool=True, - steps_offset: int=0, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - clip_sample_range: float=1.0, - sample_max_value: float=1.0, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype="float32") elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype="float32") + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32") elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype="float32", )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype="float32", + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -171,20 +173,16 @@ def __init__( # For the final step, there is no previous alphas_cumprod because we are already at 0 # `set_alpha_to_one` decides whether we set this parameter simply to one or # whether we use the final alpha of the "non-previous" one. - self.final_alpha_cumprod = (paddle.to_tensor(1.0) if set_alpha_to_one - else self.alphas_cumprod[0]) + self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
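The coefficients summed in the LMS `step` above come from integrating Lagrange basis polynomials over each sigma interval; that is what `get_lms_coefficient` evaluates with `scipy.integrate.quad` and what `set_timesteps` caches in `self.lms_coeffs` when `preconfig` is on. A compact standalone sketch (illustrative names only):

from scipy import integrate

def lms_coefficient(sigmas, order, t, current_order):
    # integral of the Lagrange basis polynomial for `current_order` over [sigmas[t], sigmas[t + 1]]
    def lms_derivative(tau):
        prod = 1.0
        for k in range(order):
            if current_order == k:
                continue
            prod *= (tau - sigmas[t - k]) / (sigmas[t - current_order] - sigmas[t - k])
        return prod

    return integrate.quad(lms_derivative, sigmas[t], sigmas[t + 1], epsrel=1e-4)[0]

def lms_step(sample, derivatives, coeffs):
    # derivatives holds the recent per-step derivatives, newest last, like self.derivatives
    return sample + sum(c * d for c, d in zip(coeffs, reversed(derivatives)))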
@@ -200,13 +198,11 @@ def scale_model_input(self, def _get_variance(self, timestep, prev_timestep): alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev - variance = (beta_prod_t_prev / beta_prod_t) * ( - 1 - alpha_prod_t / alpha_prod_t_prev) + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) return variance @@ -232,8 +228,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -242,11 +237,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -266,27 +258,28 @@ def set_timesteps(self, num_inference_steps: int): raise ValueError( f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps.") + f" maximal {self.config.num_train_timesteps} timesteps." + ) self.num_inference_steps = num_inference_steps step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = ((np.arange(0, num_inference_steps) * step_ratio) - .round()[::-1].copy().astype(np.int64)) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) self.timesteps += self.config.steps_offset def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - eta: float=0.0, - use_clipped_model_output: bool=False, - generator=None, - variance_noise: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[DDIMSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + generator=None, + variance_noise: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -330,118 +323,104 @@ def step( # - pred_prev_sample -> "x_t-1" # 1. 
get previous step value (=t-1) - prev_timestep = (timestep - self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps # 2. compute alphas, betas alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t** - (0.5) * pred_original_sample) / beta_prod_t**(0.5) + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`") + " `v_prediction`" + ) # 4. Clip or threshold "predicted x_0" if self.config.thresholding: pred_original_sample = self._threshold_sample(pred_original_sample) elif self.config.clip_sample: pred_original_sample = pred_original_sample.clip( - -self.config.clip_sample_range, self.config.clip_sample_range) + -self.config.clip_sample_range, self.config.clip_sample_range + ) # 5. compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) variance = self._get_variance(timestep, prev_timestep) - std_dev_t = eta * variance**(0.5) + std_dev_t = eta * variance ** (0.5) if use_clipped_model_output: # the pred_epsilon is always re-derived from the clipped x_0 in Glide - pred_epsilon = (sample - alpha_prod_t** - (0.5) * pred_original_sample) / beta_prod_t**(0.5) + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2)**( - 0.5) * pred_epsilon + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction) + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction if eta > 0: if variance_noise is not None and generator is not None: raise ValueError( "Cannot pass both generator and variance_noise. Please make sure that either `generator` or" - " `variance_noise` stays `None`.") + " `variance_noise` stays `None`." 
+ ) if variance_noise is None: - variance_noise = randn_tensor( - model_output.shape, - generator=generator, - dtype=model_output.dtype) + variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) variance = std_dev_t * variance_noise prev_sample = prev_sample + variance if not return_dict: - return (prev_sample, ) + return (prev_sample,) - return DDIMSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(dtype=original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as sample alphas_cumprod = self.alphas_cumprod.cast(dtype=sample.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py index a64c94d782e46..8dfd896087d08 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddim_inverse.py @@ -47,8 +47,7 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time 
from t = [0,1]. @@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -119,45 +118,46 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - clip_sample: bool=True, - set_alpha_to_zero: bool=True, - steps_offset: int=0, - prediction_type: str="epsilon", - clip_sample_range: float=1.0, - **kwargs, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_zero: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + clip_sample_range: float = 1.0, + **kwargs, + ): if kwargs.get("set_alpha_to_one", None) is not None: - deprecation_message = "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead." - deprecate( - "set_alpha_to_one", - "1.0.0", - deprecation_message, - standard_warn=False) + deprecation_message = ( + "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead." + ) + deprecate("set_alpha_to_one", "1.0.0", deprecation_message, standard_warn=False) set_alpha_to_zero = kwargs["set_alpha_to_one"] if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype="float32") elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype="float32") + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype="float32") elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype="float32", )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype="float32", + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -167,20 +167,16 @@ def __init__( # `set_alpha_to_zero` decides whether we set this parameter simply to zero # in this case, self.step() just output the predicted noise # or whether we use the final alpha of the "non-previous" one. 
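The DDIM `step` above is Eq. (12) of https://arxiv.org/pdf/2010.02502.pdf, with the stochastic term from Eq. (16) scaled by `eta`. A minimal numpy sketch of the `epsilon` branch (illustrative names; thresholding and clipping omitted):

import numpy as np

def ddim_step(sample, model_output, alpha_prod_t, alpha_prod_t_prev, eta=0.0, noise=None):
    beta_prod_t = 1 - alpha_prod_t

    # "predicted x_0" under the epsilon parameterization
    pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
    pred_epsilon = model_output

    # sigma_t(eta) from Eq. (16)
    variance = ((1 - alpha_prod_t_prev) / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
    std_dev_t = eta * variance**0.5

    # deterministic part of Eq. (12): "direction pointing to x_t"
    pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * pred_epsilon
    prev_sample = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction

    if eta > 0:
        if noise is None:
            noise = np.random.standard_normal(sample.shape)
        prev_sample = prev_sample + std_dev_t * noise
    return prev_sample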
- self.final_alpha_cumprod = (paddle.to_tensor(0.0) if set_alpha_to_zero - else self.alphas_cumprod[-1]) + self.final_alpha_cumprod = paddle.to_tensor(0.0) if set_alpha_to_zero else self.alphas_cumprod[-1] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps).copy().astype(np.int64)) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps).copy().astype(np.int64)) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -207,75 +203,73 @@ def set_timesteps(self, num_inference_steps: int): raise ValueError( f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps.") + f" maximal {self.config.num_train_timesteps} timesteps." + ) self.num_inference_steps = num_inference_steps step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = ((np.arange(0, num_inference_steps) * step_ratio).round() - .copy().astype(np.int64)) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) self.timesteps += self.config.steps_offset def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - eta: float=0.0, - use_clipped_model_output: bool=False, - variance_noise: Optional[paddle.Tensor]=None, - return_dict: bool=True, ) -> Union[DDIMSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + variance_noise: Optional[paddle.Tensor] = None, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: # 1. get previous step value (=t+1) - prev_timestep = (timestep + self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps # 2. compute alphas, betas # change original implementation to exactly match noise levels for analogous forward process alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] - if prev_timestep < self.config.num_train_timesteps - else self.final_alpha_cumprod) + alpha_prod_t_prev = ( + self.alphas_cumprod[prev_timestep] + if prev_timestep < self.config.num_train_timesteps + else self.final_alpha_cumprod + ) beta_prod_t = 1 - alpha_prod_t # 3. 
compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - pred_epsilon = (sample - alpha_prod_t** - (0.5) * pred_original_sample) / beta_prod_t**(0.5) + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * model_output - pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`") + " `v_prediction`" + ) # 4. Clip or threshold "predicted x_0" if self.config.clip_sample: pred_original_sample = pred_original_sample.clip( - -self.config.clip_sample_range, self.config.clip_sample_range) + -self.config.clip_sample_range, self.config.clip_sample_range + ) # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev)**(0.5) * pred_epsilon + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = (alpha_prod_t_prev** - (0.5) * pred_original_sample + pred_sample_direction) + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction if not return_dict: return (prev_sample, pred_original_sample) - return DDIMSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def __len__(self): return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py index a3917f57615f8..167ae05b5b169 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ddpm.py @@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -123,31 +123,35 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - variance_type: str="fixed_small", - clip_sample: bool=True, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - clip_sample_range: float=1.0, - sample_max_value: float=1.0, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + 
beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + variance_type: str = "fixed_small", + clip_sample: bool = True, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) @@ -156,8 +160,7 @@ def __init__( betas = paddle.linspace(-6, 6, num_train_timesteps) self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -169,14 +172,11 @@ def __init__( # setable values self.custom_timesteps = False self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) self.variance_type = variance_type - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -191,9 +191,10 @@ def scale_model_input(self, return sample def set_timesteps( - self, - num_inference_steps: Optional[int]=None, - timesteps: Optional[List[int]]=None, ): + self, + num_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + ): """ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. @@ -207,20 +208,18 @@ def set_timesteps( must be `None`. """ if num_inference_steps is not None and timesteps is not None: - raise ValueError( - "Can only pass one of `num_inference_steps` or `custom_timesteps`." - ) + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") if timesteps is not None: for i in range(1, len(timesteps)): if timesteps[i] >= timesteps[i - 1]: - raise ValueError( - "`custom_timesteps` must be in descending order.") + raise ValueError("`custom_timesteps` must be in descending order.") if timesteps[0] >= self.config.num_train_timesteps: raise ValueError( f"`timesteps` must start before `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps}.") + f" {self.config.num_train_timesteps}." 
+ ) timesteps = np.array(timesteps, dtype=np.int64) self.custom_timesteps = True @@ -229,11 +228,11 @@ def set_timesteps( raise ValueError( f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps.") + f" maximal {self.config.num_train_timesteps} timesteps." + ) self.num_inference_steps = num_inference_steps step_ratio = self.config.num_train_timesteps // self.num_inference_steps - timesteps = ((np.arange(0, num_inference_steps) * step_ratio) - .round()[::-1].copy().astype(np.int64)) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.custom_timesteps = False self.timesteps = paddle.to_tensor(timesteps) @@ -242,8 +241,7 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None): prev_t = self.previous_timestep(t) alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[ - prev_t] if prev_t >= 0 else self.one + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) @@ -301,8 +299,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -310,11 +307,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -322,12 +316,13 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: return sample def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - generator=None, - return_dict: bool=True, ) -> Union[DDPMSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + generator=None, + return_dict: bool = True, + ) -> Union[DDPMSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). 
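`_threshold_sample`, shared by the DDIM and DDPM schedulers in the hunks above, is the dynamic thresholding of https://arxiv.org/abs/2205.11487: clip the predicted x_0 to a per-sample percentile s (never below 1) and rescale by s. A small numpy sketch, assuming an NCHW tensor and `sample_max_value >= 1`:

import numpy as np

def threshold_sample(sample, dynamic_thresholding_ratio=0.995, sample_max_value=1.0):
    batch_size = sample.shape[0]
    abs_sample = np.abs(sample).reshape(batch_size, -1)

    s = np.quantile(abs_sample, dynamic_thresholding_ratio, axis=1)  # "a certain percentile absolute pixel value"
    s = np.clip(s, 1.0, sample_max_value)                            # s >= 1 leaves values already in [-1, 1] untouched
    s = s.reshape(batch_size, *([1] * (sample.ndim - 1)))            # broadcast over channels and pixels

    return np.clip(sample, -s, s) / s                                # threshold to [-s, s], then divide by s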
@@ -349,19 +344,17 @@ def step( t = timestep prev_t = self.previous_timestep(t) - if model_output.shape[1] == sample.shape[ - 1] * 2 and self.variance_type in [ - "learned", - "learned_range", - ]: + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in [ + "learned", + "learned_range", + ]: model_output, predicted_variance = model_output.chunk(2, axis=1) else: predicted_variance = None # 1. compute alphas, betas alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = self.alphas_cumprod[ - prev_t] if prev_t >= 0 else self.one + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev current_alpha_t = alpha_prod_t / alpha_prod_t_prev @@ -370,17 +363,16 @@ def step( # 2. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) elif self.config.prediction_type == "sample": pred_original_sample = model_output elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - ( - beta_prod_t**0.5) * model_output + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" - " `v_prediction` for the DDPMScheduler.") + " `v_prediction` for the DDPMScheduler." + ) # 3. Clip or threshold "predicted x_0" if self.config.thresholding: @@ -389,84 +381,69 @@ def step( pred_original_sample = paddle.clip( pred_original_sample, -self.config.clip_sample_range, - self.config.clip_sample_range, ) + self.config.clip_sample_range, + ) # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample_coeff = (alpha_prod_t_prev - **(0.5) * current_beta_t) / beta_prod_t - current_sample_coeff = current_alpha_t**( - 0.5) * beta_prod_t_prev / beta_prod_t + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t + current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t # 5. Compute predicted previous sample µ_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_prev_sample = (pred_original_sample_coeff * pred_original_sample + - current_sample_coeff * sample) + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample # 6. 
Add noise variance = 0 if t > 0: - variance_noise = randn_tensor( - model_output.shape, - generator=generator, - dtype=model_output.dtype) + variance_noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) if self.variance_type == "fixed_small_log": - variance = (self._get_variance( - t, predicted_variance=predicted_variance) * variance_noise) + variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise elif self.variance_type == "learned_range": - variance = self._get_variance( - t, predicted_variance=predicted_variance) + variance = self._get_variance(t, predicted_variance=predicted_variance) variance = paddle.exp(0.5 * variance) * variance_noise else: - variance = (self._get_variance( - t, predicted_variance=predicted_variance) - **0.5) * variance_noise + variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise pred_prev_sample = pred_prev_sample + variance if not return_dict: - return (pred_prev_sample, ) + return (pred_prev_sample,) - return DDPMSchedulerOutput( - prev_sample=pred_prev_sample, - pred_original_sample=pred_original_sample) + return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - def get_velocity(self, - sample: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor) -> paddle.Tensor: + def get_velocity(self, sample: paddle.Tensor, noise: paddle.Tensor, timesteps: paddle.Tensor) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(sample.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(sample.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) @@ -485,9 +462,9 @@ def previous_timestep(self, timestep): else: 
prev_t = self.timesteps[index + 1] else: - num_inference_steps = (self.num_inference_steps - if self.num_inference_steps else - self.config.num_train_timesteps) + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) prev_t = timestep - self.config.num_train_timesteps // num_inference_steps return prev_t diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py index 845b209a9bc2d..7d4b5802fb447 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_deis_multistep.py @@ -23,8 +23,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -47,7 +46,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -113,38 +112,41 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[np.ndarray]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - algorithm_type: str="deis", - solver_type: str="logrho", - lower_order_final: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[np.ndarray] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "deis", + solver_type: str = "logrho", + lower_order_final: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. 
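The two coefficients assembled in the DDPM `step` above are the posterior mean weights of q(x_{t-1} | x_t, x_0), Eq. (7) of https://arxiv.org/pdf/2006.11239.pdf. Stripped of variance handling and clipping, the update reduces to the following sketch (illustrative names):

def ddpm_posterior_mean(sample, pred_original_sample, alpha_prod_t, alpha_prod_t_prev):
    beta_prod_t = 1 - alpha_prod_t
    beta_prod_t_prev = 1 - alpha_prod_t_prev
    current_alpha_t = alpha_prod_t / alpha_prod_t_prev
    current_beta_t = 1 - current_alpha_t

    # Eq. (7): weights on the predicted x_0 and on the current x_t
    pred_original_sample_coeff = alpha_prod_t_prev**0.5 * current_beta_t / beta_prod_t
    current_sample_coeff = current_alpha_t**0.5 * beta_prod_t_prev / beta_prod_t
    return pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample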
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -161,23 +163,17 @@ def __init__( if algorithm_type in ["dpmsolver", "dpmsolver++"]: self.register_to_config(algorithm_type="deis") else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["logrho"]: if solver_type in ["midpoint", "heun", "bh1", "bh2"]: self.register_to_config(solver_type="logrho") else: - raise NotImplementedError( - f"solver type {solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"solver type {solver_type} does is not implemented for {self.__class__}") # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.lower_order_nums = 0 @@ -190,9 +186,12 @@ def set_timesteps(self, num_inference_steps: int): num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. """ - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. 
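`set_timesteps` in the DEIS hunk above spaces the inference timesteps with `np.linspace` over the training range, rounds to integers, reverses them and drops the trailing point; as the comment notes, the rounding can produce duplicates when `num_inference_steps` approaches `num_train_timesteps`. The spacing itself, as a tiny standalone sketch:

import numpy as np

def deis_inference_timesteps(num_train_timesteps=1000, num_inference_steps=20):
    return (
        np.linspace(0, num_train_timesteps - 1, num_inference_steps + 1)
        .round()[::-1][:-1]  # descending order; the duplicate endpoint is dropped
        .copy()
        .astype(np.int64)
    )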
@@ -203,7 +202,9 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = len(timesteps) - self.model_outputs = [None, ] * self.config.solver_order + self.model_outputs = [ + None, + ] * self.config.solver_order self.lower_order_nums = 0 def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: @@ -228,8 +229,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -237,21 +237,15 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) return sample - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm DEIS needs. @@ -275,7 +269,8 @@ def convert_model_output(self, else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DEISMultistepScheduler.") + " `v_prediction` for the DEISMultistepScheduler." + ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -287,11 +282,12 @@ def convert_model_output(self, raise NotImplementedError("only support log-rho multistep deis now") def deis_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DEIS (equivalent to DDIM). @@ -305,24 +301,23 @@ def deis_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. 
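`convert_model_output` above maps whatever the network predicts (`epsilon`, `sample`, or `v_prediction`) onto one parameterization before the DEIS updates run. The conversions to a predicted x_0 sketched below follow the usual convention for these multistep schedulers, with alpha_t = sqrt(alpha_bar_t) and sigma_t = sqrt(1 - alpha_bar_t); this is a hedged sketch of that convention, not a quote of the method body elided from the hunk:

def to_x0_prediction(model_output, sample, alpha_t, sigma_t, prediction_type="epsilon"):
    if prediction_type == "epsilon":
        return (sample - sigma_t * model_output) / alpha_t
    if prediction_type == "sample":
        return model_output
    if prediction_type == "v_prediction":
        return alpha_t * sample - sigma_t * model_output
    raise ValueError(f"unknown prediction_type: {prediction_type}")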
""" - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[ - timestep] + lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] sigma_t, _ = self.sigma_t[prev_timestep], self.sigma_t[timestep] h = lambda_t - lambda_s if self.config.algorithm_type == "deis": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0 - )) * model_output + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output else: raise NotImplementedError("only support log-rho multistep deis now") return x_t def multistep_deis_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order multistep DEIS. @@ -342,28 +337,28 @@ def multistep_deis_second_order_update( alpha_t, alpha_s0, alpha_s1 = ( self.alpha_t[t], self.alpha_t[s0], - self.alpha_t[s1], ) + self.alpha_t[s1], + ) sigma_t, sigma_s0, sigma_s1 = ( self.sigma_t[t], self.sigma_t[s0], - self.sigma_t[s1], ) + self.sigma_t[s1], + ) rho_t, rho_s0, rho_s1 = ( sigma_t / alpha_t, sigma_s0 / alpha_s0, - sigma_s1 / alpha_s1, ) + sigma_s1 / alpha_s1, + ) if self.config.algorithm_type == "deis": def ind_fn(t, b, c): # Integrate[(log(t) - log(c)) / (log(b) - log(c)), {t}] - return (t * (-paddle.log(c) + paddle.log(t) - 1) / - (paddle.log(b) - paddle.log(c))) + return t * (-paddle.log(c) + paddle.log(t) - 1) / (paddle.log(b) - paddle.log(c)) - coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, - rho_s1) - coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, - rho_s0) + coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, rho_s1) + coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s0) x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1) return x_t @@ -371,11 +366,12 @@ def ind_fn(t, b, c): raise NotImplementedError("only support log-rho multistep deis now") def multistep_deis_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order multistep DEIS. 
@@ -394,57 +390,60 @@ def multistep_deis_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] alpha_t, alpha_s0, alpha_s1, alpha_s2 = ( self.alpha_t[t], self.alpha_t[s0], self.alpha_t[s1], - self.alpha_t[s2], ) + self.alpha_t[s2], + ) sigma_t, sigma_s0, sigma_s1, simga_s2 = ( self.sigma_t[t], self.sigma_t[s0], self.sigma_t[s1], - self.sigma_t[s2], ) + self.sigma_t[s2], + ) rho_t, rho_s0, rho_s1, rho_s2 = ( sigma_t / alpha_t, sigma_s0 / alpha_s0, sigma_s1 / alpha_s1, - simga_s2 / alpha_s2, ) + simga_s2 / alpha_s2, + ) if self.config.algorithm_type == "deis": def ind_fn(t, b, c, d): # Integrate[(log(t) - log(c))(log(t) - log(d)) / (log(b) - log(c))(log(b) - log(d)), {t}] numerator = t * ( - paddle.log(c) * (paddle.log(d) - paddle.log(t) + 1 - ) - paddle.log(d) * paddle.log(t) + - paddle.log(d) + paddle.log(t)**2 - 2 * paddle.log(t) + 2) - denominator = (paddle.log(b) - paddle.log(c)) * ( - paddle.log(b) - paddle.log(d)) + paddle.log(c) * (paddle.log(d) - paddle.log(t) + 1) + - paddle.log(d) * paddle.log(t) + + paddle.log(d) + + paddle.log(t) ** 2 + - 2 * paddle.log(t) + + 2 + ) + denominator = (paddle.log(b) - paddle.log(c)) * (paddle.log(b) - paddle.log(d)) return numerator / denominator - coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn( - rho_s0, rho_s0, rho_s1, rho_s2) - coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn( - rho_s0, rho_s1, rho_s2, rho_s0) - coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn( - rho_s0, rho_s2, rho_s0, rho_s1) + coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn(rho_s0, rho_s0, rho_s1, rho_s2) + coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s2, rho_s0) + coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s2, rho_s0, rho_s1) - x_t = alpha_t * ( - sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2) + x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2) return x_t else: raise NotImplementedError("only support log-rho multistep deis now") def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep DEIS. 
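The third-order ind_fn in the hunk above works the same way: the bracketed polynomial in log t is an antiderivative of (log t - log c)(log t - log d) / ((log b - log c)(log b - log d)). The same style of check, again with hypothetical values:

import numpy as np

def ind_fn3(t, b, c, d):
    numerator = t * (
        np.log(c) * (np.log(d) - np.log(t) + 1)
        - np.log(d) * np.log(t)
        + np.log(d)
        + np.log(t) ** 2
        - 2 * np.log(t)
        + 2
    )
    denominator = (np.log(b) - np.log(c)) * (np.log(b) - np.log(d))
    return numerator / denominator

b, c, d = 0.9, 0.5, 0.2                      # hypothetical rho values
lo, hi = 0.3, 0.8
t = np.linspace(lo, hi, 200001)
integrand = (np.log(t) - np.log(c)) * (np.log(t) - np.log(d)) / (
    (np.log(b) - np.log(c)) * (np.log(b) - np.log(d))
)
assert abs(np.trapz(integrand, t) - (ind_fn3(hi, b, c, d) - ind_fn3(lo, b, c, d))) < 1e-6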
@@ -470,29 +469,26 @@ def step( step_index = len(self.timesteps) - 1 else: step_index = step_index.item() - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) - lower_order_final = ((step_index == len(self.timesteps) - 1) and - self.config.lower_order_final and - len(self.timesteps) < 15) - lower_order_second = ((step_index == len(self.timesteps) - 2) and - self.config.lower_order_final and - len(self.timesteps) < 15) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] + lower_order_final = ( + (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + lower_order_second = ( + (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) model_output = self.convert_model_output(model_output, timestep, sample) for i in range(self.config.solver_order - 1): self.model_outputs[i] = self.model_outputs[i + 1] self.model_outputs[-1] = model_output - if (self.config.solver_order == 1 or self.lower_order_nums < 1 or - lower_order_final): - prev_sample = self.deis_first_order_update(model_output, timestep, - prev_timestep, sample) - elif (self.config.solver_order == 2 or self.lower_order_nums < 2 or - lower_order_second): + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.deis_first_order_update(model_output, timestep, prev_timestep, sample) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: timestep_list = [self.timesteps[step_index - 1], timestep] prev_sample = self.multistep_deis_second_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) else: timestep_list = [ self.timesteps[step_index - 2], @@ -500,18 +496,18 @@ def step( timestep, ] prev_sample = self.multistep_deis_third_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) if self.lower_order_nums < self.config.solver_order: self.lower_order_nums += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
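The conditionals in step() above implement a warm-up in solver order and, for short schedules, a cool-down at the end. A small standalone sketch of the effective order per step, mirroring those conditions (solver_order=3 here purely for illustration):

def effective_orders(num_steps, solver_order=3, lower_order_final=True):
    orders = []
    lower_order_nums = 0
    for step_index in range(num_steps):
        final = (step_index == num_steps - 1) and lower_order_final and num_steps < 15
        second = (step_index == num_steps - 2) and lower_order_final and num_steps < 15
        if solver_order == 1 or lower_order_nums < 1 or final:
            order = 1
        elif solver_order == 2 or lower_order_nums < 2 or second:
            order = 2
        else:
            order = 3
        orders.append(order)
        if lower_order_nums < solver_order:
            lower_order_nums += 1
    return orders

print(effective_orders(10))  # [1, 2, 3, 3, 3, 3, 3, 3, 2, 1]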
@@ -525,26 +521,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py index 9b360646172d5..5ebc674044afa 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -22,8 +22,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -127,39 +126,42 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - algorithm_type: str="dpmsolver++", - solver_type: str="midpoint", - lower_order_final: bool=True, - use_karras_sigmas: Optional[bool]=False, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + use_karras_sigmas: Optional[bool] = False, + ): if trained_betas is not None: 
self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -176,23 +178,17 @@ def __init__( if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["midpoint", "heun"]: if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.lower_order_nums = 0 @@ -206,18 +202,17 @@ def set_timesteps(self, num_inference_steps: int): num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. 
""" - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) if self.use_karras_sigmas: - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) - **0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) - sigmas = self._convert_to_karras( - in_sigmas=sigmas, num_inference_steps=num_inference_steps) - timesteps = np.array( - [self._sigma_to_t(sigma, log_sigmas) - for sigma in sigmas]).round() + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() timesteps = np.flip(timesteps).copy().astype(np.int64) # when num_inference_steps == num_train_timesteps, we can end up with @@ -229,7 +224,9 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = len(timesteps) - self.model_outputs = [None, ] * self.config.solver_order + self.model_outputs = [ + None, + ] * self.config.solver_order self.lower_order_nums = 0 def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: @@ -254,8 +251,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -263,11 +259,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -282,9 +275,7 @@ def _sigma_to_t(self, sigma, log_sigmas): dists = log_sigma - log_sigmas[:, np.newaxis] # get sigmas range - low_idx = (np.cumsum( - (dists >= 0), axis=0).argmax(axis=0) - .clip(max=log_sigmas.shape[0] - 2)) + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = log_sigmas[low_idx] @@ -299,8 +290,7 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t - def _convert_to_karras(self, in_sigmas: paddle.Tensor, - num_inference_steps) -> paddle.Tensor: + def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" sigma_min = in_sigmas[-1].item() @@ -308,15 +298,12 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor, rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min**(1 / rho) - max_inv_rho = sigma_max**(1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. @@ -339,19 +326,18 @@ def convert_model_output(self, # DPM-Solver++ needs to solve an integral of the data prediction model. if self.config.algorithm_type == "dpmsolver++": if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = (sample - sigma_t * model_output) / alpha_t elif self.config.prediction_type == "sample": x0_pred = model_output elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = alpha_t * sample - sigma_t * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverMultistepScheduler.") + " `v_prediction` for the DPMSolverMultistepScheduler." + ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -362,26 +348,26 @@ def convert_model_output(self, if self.config.prediction_type == "epsilon": return model_output elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = (sample - alpha_t * model_output) / sigma_t return epsilon elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = alpha_t * model_output + sigma_t * sample return epsilon else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverMultistepScheduler.") + " `v_prediction` for the DPMSolverMultistepScheduler." + ) def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DPM-Solver (equivalent to DDIM). @@ -397,25 +383,23 @@ def dpm_solver_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. 
""" - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[ - timestep] + lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep] h = lambda_t - lambda_s if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * ( - paddle.exp(-h) - 1.0)) * model_output + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output elif self.config.algorithm_type == "dpmsolver": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0 - )) * model_output + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output return x_t def multistep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order multistep DPM-Solver. @@ -435,7 +419,8 @@ def multistep_dpm_solver_second_order_update( lambda_t, lambda_s0, lambda_s1 = ( self.lambda_t[t], self.lambda_t[s0], - self.lambda_t[s1], ) + self.lambda_t[s1], + ) alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 @@ -444,31 +429,40 @@ def multistep_dpm_solver_second_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2211.01095 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 * - (alpha_t * (paddle.exp(-h) - 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - 0.5 * - (sigma_t * (paddle.exp(h) - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + ) return x_t def multistep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order multistep DPM-Solver. 
@@ -487,14 +481,15 @@ def multistep_dpm_solver_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1], - self.lambda_t[s2], ) + self.lambda_t[s2], + ) alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0] sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0] h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 @@ -505,24 +500,29 @@ def multistep_dpm_solver_third_order_update( D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * ( - (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ((alpha_t / alpha_s0) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * ( - (paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2) + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) return x_t def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep DPM-Solver. 
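The D0/D1/D2 terms in the second- and third-order updates are scaled divided differences of the converted model output viewed as a function of lambda; with a smooth stand-in function they behave like h * m' and (h**2 / 2) * m''. A rough numeric illustration, all values hypothetical:

import numpy as np

m = np.cos                                               # smooth stand-in for the model output m(lambda)
lam_s2, lam_s1, lam_s0, lam_t = 0.10, 0.35, 0.55, 0.70   # increasing half-log-SNR
h, h_0, h_1 = lam_t - lam_s0, lam_s0 - lam_s1, lam_s1 - lam_s2
r0, r1 = h_0 / h, h_1 / h
m0, m1, m2 = m(lam_s0), m(lam_s1), m(lam_s2)

D1_0 = (1.0 / r0) * (m0 - m1)
D1_1 = (1.0 / r1) * (m1 - m2)
D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)

# First difference ~ h * m' near the midpoint of s0 and s1; m' = -sin for m = cos.
assert abs(D1_0 / h - (-np.sin((lam_s0 + lam_s1) / 2))) < 1e-2
# Second difference ~ (h**2 / 2) * m''; m'' = -cos for m = cos.
assert abs(2 * D2 / h**2 - (-np.cos(lam_s1))) < 2e-2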
@@ -548,29 +548,26 @@ def step( step_index = len(self.timesteps) - 1 else: step_index = step_index.item() - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) - lower_order_final = ((step_index == len(self.timesteps) - 1) and - self.config.lower_order_final and - len(self.timesteps) < 15) - lower_order_second = ((step_index == len(self.timesteps) - 2) and - self.config.lower_order_final and - len(self.timesteps) < 15) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] + lower_order_final = ( + (step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + lower_order_second = ( + (step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) model_output = self.convert_model_output(model_output, timestep, sample) for i in range(self.config.solver_order - 1): self.model_outputs[i] = self.model_outputs[i + 1] self.model_outputs[-1] = model_output - if (self.config.solver_order == 1 or self.lower_order_nums < 1 or - lower_order_final): - prev_sample = self.dpm_solver_first_order_update( - model_output, timestep, prev_timestep, sample) - elif (self.config.solver_order == 2 or self.lower_order_nums < 2 or - lower_order_second): + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.dpm_solver_first_order_update(model_output, timestep, prev_timestep, sample) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: timestep_list = [self.timesteps[step_index - 1], timestep] prev_sample = self.multistep_dpm_solver_second_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) else: timestep_list = [ self.timesteps[step_index - 2], @@ -578,18 +575,18 @@ def step( timestep, ] prev_sample = self.multistep_dpm_solver_third_order_update( - self.model_outputs, timestep_list, prev_timestep, sample) + self.model_outputs, timestep_list, prev_timestep, sample + ) if self.lower_order_nums < self.config.solver_order: self.lower_order_nums += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
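Putting the pieces of step() together, a minimal usage sketch of this scheduler. The top-level import assumes DPMSolverMultistepScheduler is re-exported from the ppdiffusers package root (as its diffusers counterpart is); the zero model output is only a stand-in for a real UNet call:

import paddle
from ppdiffusers import DPMSolverMultistepScheduler  # assumed re-export; otherwise import from the schedulers module

scheduler = DPMSolverMultistepScheduler()     # defaults as in __init__ above
scheduler.set_timesteps(num_inference_steps=20)

sample = paddle.randn([1, 4, 8, 8])           # stand-in latent
for t in scheduler.timesteps:
    # A real pipeline would call its UNet here; zeros keep the sketch
    # self-contained while still exercising the multistep machinery.
    model_output = paddle.zeros_like(sample)
    sample = scheduler.step(model_output, t, sample).prev_sample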
@@ -603,26 +600,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py index 499d2e90373b9..0e99f01aa230b 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -22,8 +22,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -123,38 +122,41 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[np.ndarray]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - algorithm_type: str="dpmsolver++", - solver_type: str="midpoint", - lower_order_final: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[np.ndarray] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = 
paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -171,22 +173,16 @@ def __init__( if algorithm_type == "deis": self.register_to_config(algorithm_type="dpmsolver++") else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["midpoint", "heun"]: if solver_type in ["logrho", "bh1", "bh2"]: self.register_to_config(solver_type="midpoint") else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.sample = None @@ -248,8 +244,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -257,11 +252,8 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) @@ -277,18 +269,18 @@ def set_timesteps(self, num_inference_steps: int): the number of diffusion steps used when generating samples with a pre-trained model. 
""" self.num_inference_steps = num_inference_steps - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * self.config.solver_order self.sample = None self.orders = self.get_order_list(num_inference_steps) - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. @@ -311,19 +303,18 @@ def convert_model_output(self, # DPM-Solver++ needs to solve an integral of the data prediction model. if self.config.algorithm_type == "dpmsolver++": if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = (sample - sigma_t * model_output) / alpha_t elif self.config.prediction_type == "sample": x0_pred = model_output elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = alpha_t * sample - sigma_t * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverSinglestepScheduler.") + " `v_prediction` for the DPMSolverSinglestepScheduler." + ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -334,26 +325,26 @@ def convert_model_output(self, if self.config.prediction_type == "epsilon": return model_output elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = (sample - alpha_t * model_output) / sigma_t return epsilon elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = alpha_t * model_output + sigma_t * sample return epsilon else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the DPMSolverSinglestepScheduler.") + " `v_prediction` for the DPMSolverSinglestepScheduler." + ) def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DPM-Solver (equivalent to DDIM). @@ -369,25 +360,23 @@ def dpm_solver_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. 
""" - lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[ - timestep] + lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep] alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep] sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep] h = lambda_t - lambda_s if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * ( - paddle.exp(-h) - 1.0)) * model_output + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output elif self.config.algorithm_type == "dpmsolver": - x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0 - )) * model_output + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (paddle.exp(h) - 1.0)) * model_output return x_t def singlestep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order singlestep DPM-Solver. @@ -409,7 +398,8 @@ def singlestep_dpm_solver_second_order_update( lambda_t, lambda_s0, lambda_s1 = ( self.lambda_t[t], self.lambda_t[s0], - self.lambda_t[s1], ) + self.lambda_t[s1], + ) alpha_t, alpha_s1 = self.alpha_t[t], self.alpha_t[s1] sigma_t, sigma_s1 = self.sigma_t[t], self.sigma_t[s1] h, h_0 = lambda_t - lambda_s1, lambda_s0 - lambda_s1 @@ -418,31 +408,40 @@ def singlestep_dpm_solver_second_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2211.01095 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s1) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 * - (alpha_t * (paddle.exp(-h) - 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s1) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((sigma_t / sigma_s1) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s1) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((alpha_t / alpha_s1) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - 0.5 * - (sigma_t * (paddle.exp(h) - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s1) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (paddle.exp(h) - 1.0)) * D1 + ) elif self.config.solver_type == "heun": - x_t = ((alpha_t / alpha_s1) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1) + x_t = ( + (alpha_t / alpha_s1) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + ) return x_t def singlestep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order singlestep DPM-Solver. 
@@ -463,14 +462,15 @@ def singlestep_dpm_solver_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1], - self.lambda_t[s2], ) + self.lambda_t[s2], + ) alpha_t, alpha_s2 = self.alpha_t[t], self.alpha_t[s2] sigma_t, sigma_s2 = self.sigma_t[t], self.sigma_t[s2] h, h_0, h_1 = lambda_t - lambda_s2, lambda_s0 - lambda_s2, lambda_s1 - lambda_s2 @@ -482,35 +482,43 @@ def singlestep_dpm_solver_third_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s2) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1_1) + x_t = ( + (sigma_t / sigma_s2) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1_1 + ) elif self.config.solver_type == "heun": x_t = ( - (sigma_t / sigma_s2) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * ( - (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2) + (sigma_t / sigma_s2) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) elif self.config.algorithm_type == "dpmsolver": # See https://arxiv.org/abs/2206.00927 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((alpha_t / alpha_s2) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1_1) + x_t = ( + (alpha_t / alpha_s2) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1_1 + ) elif self.config.solver_type == "heun": - x_t = ((alpha_t / alpha_s2) * sample - - (sigma_t * (paddle.exp(h) - 1.0)) * D0 - (sigma_t * ( - (paddle.exp(h) - 1.0) / h - 1.0)) * D1 - (sigma_t * ( - (paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2) + x_t = ( + (alpha_t / alpha_s2) * sample + - (sigma_t * (paddle.exp(h) - 1.0)) * D0 + - (sigma_t * ((paddle.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((paddle.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) return x_t def singlestep_dpm_solver_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, - order: int, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + order: int, + ) -> paddle.Tensor: """ One step for the singlestep DPM-Solver. @@ -528,23 +536,25 @@ def singlestep_dpm_solver_update( `paddle.Tensor`: the sample tensor at the previous timestep. 
""" if order == 1: - return self.dpm_solver_first_order_update( - model_output_list[-1], timestep_list[-1], prev_timestep, sample) + return self.dpm_solver_first_order_update(model_output_list[-1], timestep_list[-1], prev_timestep, sample) elif order == 2: return self.singlestep_dpm_solver_second_order_update( - model_output_list, timestep_list, prev_timestep, sample) + model_output_list, timestep_list, prev_timestep, sample + ) elif order == 3: return self.singlestep_dpm_solver_third_order_update( - model_output_list, timestep_list, prev_timestep, sample) + model_output_list, timestep_list, prev_timestep, sample + ) else: raise ValueError(f"Order must be 1, 2, 3, got {order}") def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the singlestep DPM-Solver. @@ -570,8 +580,7 @@ def step( step_index = len(self.timesteps) - 1 else: step_index = step_index.item() - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] model_output = self.convert_model_output(model_output, timestep, sample) for i in range(self.config.solver_order - 1): @@ -583,20 +592,17 @@ def step( if order == 1: self.sample = sample - timestep_list = [ - self.timesteps[step_index - i] for i in range(order - 1, 0, -1) - ] + [timestep] + timestep_list = [self.timesteps[step_index - i] for i in range(order - 1, 0, -1)] + [timestep] prev_sample = self.singlestep_dpm_solver_update( - self.model_outputs, timestep_list, prev_timestep, self.sample, - order) + self.model_outputs, timestep_list, prev_timestep, self.sample, order + ) if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
@@ -610,26 +616,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py index 82931a90d6eff..eccdbb7bfdcf4 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_dpmsolver_unidiffuser.py @@ -18,17 +18,14 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput def logaddexp(x, y): - return paddle.log(1 + paddle.exp( - paddle.minimum(x, y) - paddle.maximum(x, y))) + paddle.maximum(x, y) + return paddle.log(1 + paddle.exp(paddle.minimum(x, y) - paddle.maximum(x, y))) + paddle.maximum(x, y) -def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, - yp: paddle.Tensor) -> paddle.Tensor: +def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, yp: paddle.Tensor) -> paddle.Tensor: """Performs piecewise linear interpolation for x, using xp and yp keypoints (knots). Performs separate interpolation for each channel. 
Args: @@ -45,8 +42,7 @@ def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, >>> calibrate1d(paddle.to_tensor([[-10]]), paddle.to_tensor([[0.0, 1.0]]), paddle.to_tensor([[0.0, 2.0]])) tensor([[-20.0000]]) """ - x_breakpoints = paddle.concat( - [x.unsqueeze(2), xp.unsqueeze(0).tile((x.shape[0], 1, 1))], axis=2) + x_breakpoints = paddle.concat([x.unsqueeze(2), xp.unsqueeze(0).tile((x.shape[0], 1, 1))], axis=2) num_x_points = xp.shape[1] sorted_x_breakpoints = paddle.sort(x_breakpoints, axis=2) x_indices = paddle.argsort(x_breakpoints, axis=2) @@ -58,29 +54,26 @@ def interpolate_fn(x: paddle.Tensor, xp: paddle.Tensor, paddle.where( paddle.equal(x_idx, num_x_points), paddle.to_tensor([num_x_points - 2]), - cand_start_idx, ), ) - end_idx = paddle.where( - paddle.equal(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) - start_x = paddle.take_along_axis( - arr=sorted_x_breakpoints, axis=2, - indices=start_idx.unsqueeze(axis=2)).squeeze(axis=2) - end_x = paddle.take_along_axis( - arr=sorted_x_breakpoints, axis=2, - indices=end_idx.unsqueeze(axis=2)).squeeze(axis=2) + cand_start_idx, + ), + ) + end_idx = paddle.where(paddle.equal(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) + start_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=start_idx.unsqueeze(axis=2)).squeeze( + axis=2 + ) + end_x = paddle.take_along_axis(arr=sorted_x_breakpoints, axis=2, indices=end_idx.unsqueeze(axis=2)).squeeze(axis=2) start_idx2 = paddle.where( paddle.equal(x_idx, 0), paddle.to_tensor([0]), paddle.where( paddle.equal(x_idx, num_x_points), paddle.to_tensor([num_x_points - 2]), - cand_start_idx, ), ) + cand_start_idx, + ), + ) y_positions_expanded = yp.unsqueeze(0).expand([x.shape[0], -1, -1]) - start_y = paddle.take_along_axis( - y_positions_expanded, axis=2, - indices=start_idx2.unsqueeze(2)).squeeze(2) - end_y = paddle.take_along_axis( - y_positions_expanded, axis=2, - indices=(start_idx2 + 1).unsqueeze(2)).squeeze(2) + start_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=start_idx2.unsqueeze(2)).squeeze(2) + end_y = paddle.take_along_axis(y_positions_expanded, axis=2, indices=(start_idx2 + 1).unsqueeze(2)).squeeze(2) cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x) return cand @@ -128,35 +121,38 @@ class DPMSolverUniDiffuserScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, - beta_end: float=0.0120, - method="multistep", - schedule: str="discrete", - beta_schedule: str="scaled_linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - algorithm_type: str="dpmsolver++", - solver_type: str="midpoint", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.0120, + method="multistep", + schedule: str = "discrete", + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) if beta_schedule == "scaled_linear": # this schedule is very specific to the unidiffuser model. 
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") if schedule == "discrete": log_alphas = 0.5 * paddle.log(1 - self.betas).cumsum(axis=0) self.total_N = len(log_alphas) - self.t_discrete = paddle.linspace(1.0 / self.total_N, 1.0, - self.total_N).reshape([1, -1]) + self.t_discrete = paddle.linspace(1.0 / self.total_N, 1.0, self.total_N).reshape([1, -1]) self.log_alpha_discrete = log_alphas.reshape((1, -1)) else: raise ValueError @@ -172,16 +168,12 @@ def __init__( if algorithm_type == "deis": algorithm_type = "dpmsolver++" else: - raise NotImplementedError( - f"{algorithm_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") if solver_type not in ["midpoint"]: if solver_type in ["logrho", "bh1", "bh2"]: solver_type = "midpoint" else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 @@ -196,7 +188,8 @@ def marginal_log_mean_coeff(self, t): return interpolate_fn( t.reshape((-1, 1)), self.t_discrete.clone(), - self.log_alpha_discrete.clone(), ).reshape((-1, )) + self.log_alpha_discrete.clone(), + ).reshape((-1,)) else: raise ValueError @@ -207,8 +200,7 @@ def marginal_std(self, t): """ Compute sigma_t of a given continuous-time label t in [0, T]. """ - return paddle.sqrt(1.0 - paddle.exp(2.0 * self.marginal_log_mean_coeff( - t))) + return paddle.sqrt(1.0 - paddle.exp(2.0 * self.marginal_log_mean_coeff(t))) def marginal_lambda(self, t): """ @@ -220,12 +212,13 @@ def marginal_lambda(self, t): def inverse_lambda(self, lamb): if self.schedule == "discrete": - log_alpha = -0.5 * logaddexp(paddle.zeros((1, )), -2.0 * lamb) + log_alpha = -0.5 * logaddexp(paddle.zeros((1,)), -2.0 * lamb) t = interpolate_fn( log_alpha.reshape((-1, 1)), paddle.flip(self.log_alpha_discrete.clone(), [1]), - paddle.flip(self.t_discrete.clone(), [1]), ) - return t.reshape((-1, )) + paddle.flip(self.t_discrete.clone(), [1]), + ) + return t.reshape((-1,)) else: raise ValueError @@ -243,10 +236,7 @@ def set_timesteps(self, num_inference_steps: int): self.noise_prev_list = [] self.t_prev_list = [] - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: """ Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. @@ -267,17 +257,17 @@ def convert_model_output(self, `paddle.Tensor`: the converted model output. """ # DPM-Solver++ needs to solve an integral of the data prediction model. 
- alpha_t, sigma_t = self.marginal_alpha(timestep), self.marginal_std( - timestep) + alpha_t, sigma_t = self.marginal_alpha(timestep), self.marginal_std(timestep) x0_pred = (sample - sigma_t * model_output) / alpha_t return x0_pred def dpm_solver_first_order_update( - self, - model_output: paddle.Tensor, - timestep: int, - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + timestep: int, + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the first-order DPM-Solver (equivalent to DDIM). @@ -293,27 +283,25 @@ def dpm_solver_first_order_update( Returns: `paddle.Tensor`: the sample tensor at the previous timestep. """ - lambda_t, lambda_s = self.marginal_lambda( - timestep), self.marginal_lambda(prev_timestep) + lambda_t, lambda_s = self.marginal_lambda(timestep), self.marginal_lambda(prev_timestep) alpha_t = self.marginal_log_mean_coeff(timestep) - sigma_t, sigma_s = self.marginal_std(timestep), self.marginal_std( - prev_timestep) + sigma_t, sigma_s = self.marginal_std(timestep), self.marginal_std(prev_timestep) alpha_t = paddle.exp(alpha_t) h = lambda_t - lambda_s if self.config.algorithm_type == "dpmsolver++": - x_t = (sigma_t / sigma_s) * sample - (alpha_t * ( - paddle.exp(-h) - 1.0)) * model_output + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (paddle.exp(-h) - 1.0)) * model_output else: raise ValueError return x_t def multistep_dpm_solver_second_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the second-order multistep DPM-Solver. @@ -333,7 +321,8 @@ def multistep_dpm_solver_second_order_update( lambda_t, lambda_s0, lambda_s1 = ( self.marginal_lambda(t), self.marginal_lambda(s0), - self.marginal_lambda(s1), ) + self.marginal_lambda(s1), + ) log_alpha_t = self.marginal_log_mean_coeff(t) sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0) h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 @@ -343,19 +332,22 @@ def multistep_dpm_solver_second_order_update( if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2211.01095 for detailed derivations if self.config.solver_type == "midpoint": - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 - 0.5 * - (alpha_t * (paddle.exp(-h) - 1.0)) * D1) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (paddle.exp(-h) - 1.0)) * D1 + ) else: raise ValueError return x_t def multistep_dpm_solver_third_order_update( - self, - model_output_list: List[paddle.Tensor], - timestep_list: List[int], - prev_timestep: int, - sample: paddle.Tensor, ) -> paddle.Tensor: + self, + model_output_list: List[paddle.Tensor], + timestep_list: List[int], + prev_timestep: int, + sample: paddle.Tensor, + ) -> paddle.Tensor: """ One step for the third-order multistep DPM-Solver. 
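The logaddexp helper defined earlier in this file is what makes inverse_lambda stable: for a VP process with sigma**2 = 1 - alpha**2 (assumed here), log(alpha) = -0.5 * log(1 + exp(-2 * lambda)), which is exactly -0.5 * logaddexp(0, -2 * lambda) evaluated without overflow. A NumPy round-trip check:

import numpy as np

alpha = np.array([0.999, 0.9, 0.5, 0.05])     # hypothetical marginal alpha values
sigma = np.sqrt(1 - alpha**2)
lam = np.log(alpha) - np.log(sigma)           # half-log-SNR, as in marginal_lambda
log_alpha_back = -0.5 * np.logaddexp(0.0, -2.0 * lam)
assert np.allclose(log_alpha_back, np.log(alpha))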
@@ -374,14 +366,15 @@ def multistep_dpm_solver_third_order_update( prev_timestep, timestep_list[-1], timestep_list[-2], - timestep_list[-3], ) - m0, m1, m2 = model_output_list[-1], model_output_list[ - -2], model_output_list[-3] + timestep_list[-3], + ) + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( self.marginal_lambda(t), self.marginal_lambda(s0), self.marginal_lambda(s1), - self.marginal_lambda(s2), ) + self.marginal_lambda(s2), + ) alpha_t = self.marginal_log_mean_coeff(t) alpha_t = paddle.exp(alpha_t) sigma_t, sigma_s0 = self.marginal_std(t), self.marginal_std(s0) @@ -393,20 +386,23 @@ def multistep_dpm_solver_third_order_update( D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) if self.config.algorithm_type == "dpmsolver++": # See https://arxiv.org/abs/2206.00927 for detailed derivations - x_t = ((sigma_t / sigma_s0) * sample - - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + (alpha_t * ( - (paddle.exp(-h) - 1.0) / h + 1.0)) * D1 - (alpha_t * ( - (paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2) + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (paddle.exp(-h) - 1.0)) * D0 + + (alpha_t * ((paddle.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((paddle.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) else: raise ValueError return x_t def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep DPM-Solver. @@ -437,59 +433,47 @@ def step( if self.method == "multistep": if step_index == 0: vec_t = timestep.expand([sample.shape[0]]) - model_output = self.convert_model_output(model_output, vec_t, - sample) + model_output = self.convert_model_output(model_output, vec_t, sample) self.noise_prev_list.append(model_output) self.t_prev_list.append(vec_t) if step_index > 0 and step_index < order: vec_t = timestep.expand([sample.shape[0]]) - sample = self.dpm_multistep_update(sample, self.noise_prev_list, - self.t_prev_list, vec_t, - step_index) - model_output = self.convert_model_output(model_output, vec_t, - sample) + sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, step_index) + model_output = self.convert_model_output(model_output, vec_t, sample) self.noise_prev_list.append(model_output) self.t_prev_list.append(vec_t) if step_index >= order and step_index < len(self.timesteps): vec_t = timestep.expand([sample.shape[0]]) - sample = self.dpm_multistep_update(sample, self.noise_prev_list, - self.t_prev_list, vec_t, - order) + sample = self.dpm_multistep_update(sample, self.noise_prev_list, self.t_prev_list, vec_t, order) for i in range(order - 1): self.t_prev_list[i] = self.t_prev_list[i + 1] self.noise_prev_list[i] = self.noise_prev_list[i + 1] self.t_prev_list[-1] = vec_t if step_index < len(self.timesteps) - 1: - self.noise_prev_list[-1] = self.convert_model_output( - model_output, vec_t, sample) + self.noise_prev_list[-1] = self.convert_model_output(model_output, vec_t, sample) else: raise ValueError prev_sample = sample if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def dpm_multistep_update(self, x, noise_prev_list, t_prev_list, t, order): if order == 1: - return self.dpm_solver_first_order_update(noise_prev_list[-1], t, - 
t_prev_list[-1], x) + return self.dpm_solver_first_order_update(noise_prev_list[-1], t, t_prev_list[-1], x) elif order == 2: - return self.multistep_dpm_solver_second_order_update( - noise_prev_list, t_prev_list, t, x) + return self.multistep_dpm_solver_second_order_update(noise_prev_list, t_prev_list, t, x) elif order == 3: - return self.multistep_dpm_solver_third_order_update( - noise_prev_list, t_prev_list, t, x) + return self.multistep_dpm_solver_third_order_update(noise_prev_list, t_prev_list, t, x) else: - raise ValueError("Solver order must be 1 or 2 or 3, got {}".format( - order)) + raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py index a2a0a495031de..95332c844b137 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -47,8 +47,7 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -67,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -108,37 +107,39 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. 
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -147,15 +148,11 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.is_scale_input_called = False - def scale_model_input( - self, sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. @@ -168,7 +165,7 @@ def scale_model_input( """ step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) self.is_scale_input_called = True return sample @@ -182,27 +179,21 @@ def set_timesteps(self, num_inference_steps: int): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[ - EulerAncestralDiscreteSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). 
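The same three-step pattern recurs in every scheduler touched by this diff: build betas (here via the scaled_linear squared linspace), take the cumulative product of alphas, and reparameterize to k-diffusion sigmas before interpolating them onto the inference timesteps in set_timesteps. A standalone NumPy sketch of that pipeline (the helper name is mine):

import numpy as np

def build_sigma_schedule(beta_start=0.0001, beta_end=0.02, num_train_timesteps=1000, num_inference_steps=50):
    # scaled_linear: square of a linspace between sqrt(beta_start) and sqrt(beta_end)
    betas = np.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps) ** 2
    alphas_cumprod = np.cumprod(1.0 - betas)
    # sigma_t = sqrt((1 - alpha_bar_t) / alpha_bar_t), the variance-exploding parameterization
    sigmas = ((1.0 - alphas_cumprod) / alphas_cumprod) ** 0.5
    # spread the inference timesteps over the training range, noisiest first,
    # interpolate the training sigmas onto them, and close the schedule with sigma = 0
    timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy()
    sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
    sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
    return timesteps, sigmas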
@@ -224,7 +215,8 @@ def step( if not self.is_scale_input_called: logger.warning( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." + ) step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] @@ -233,11 +225,9 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5 - ) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -245,9 +235,8 @@ def step( sigma_from = self.sigmas[step_index] sigma_to = self.sigmas[step_index + 1] - sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from - **2)**0.5 - sigma_down = (sigma_to**2 - sigma_up**2)**0.5 + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 # 2. Convert to an ODE derivative derivative = (sample - pred_original_sample) / sigma @@ -256,28 +245,28 @@ def step( prev_sample = sample + derivative * dt - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) prev_sample = prev_sample + noise * sigma_up if not return_dict: - return (prev_sample, ) + return (prev_sample,) return EulerAncestralDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + prev_sample=prev_sample, pred_original_sample=pred_original_sample + ) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py index 8d53b8dd4f3a9..a45e3bf0e5617 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_euler_discrete.py @@ -66,7 +66,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -114,39 +114,41 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, 
- beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - interpolation_type: str="linear", - use_karras_sigmas: Optional[bool]=False, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + interpolation_type: str = "linear", + use_karras_sigmas: Optional[bool] = False, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -155,16 +157,12 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.is_scale_input_called = False self.use_karras_sigmas = use_karras_sigmas - def scale_model_input( - self, sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. 
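As the docstring above states, the Euler-style schedulers rescale the model input by 1 / sqrt(sigma**2 + 1) before every UNet call: the sampler state is roughly x0 + sigma * noise, so the division brings it back to roughly unit variance. A minimal sketch with an illustrative usage (the numbers are arbitrary):

import numpy as np

def scale_model_input(sample, sigma):
    # undo the variance growth of x = x0 + sigma * noise before feeding the denoiser
    return sample / ((sigma**2 + 1) ** 0.5)

sigma = 14.6
x = np.random.randn(4, 64) * (1 + sigma**2) ** 0.5  # a latent at noise level sigma
x_in = scale_model_input(x, sigma)                   # back to roughly unit variance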
@@ -178,7 +176,7 @@ def scale_model_input( step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) self.is_scale_input_called = True return sample @@ -193,31 +191,23 @@ def set_timesteps(self, num_inference_steps: int): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) if self.config.interpolation_type == "linear": sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) elif self.config.interpolation_type == "log_linear": - sigmas = paddle.linspace( - np.log(sigmas[-1]), np.log(sigmas[0]), - num_inference_steps + 1).exp() + sigmas = paddle.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp() else: raise ValueError( f"{self.config.interpolation_type} is not implemented. Please specify interpolation_type to either" - " 'linear' or 'log_linear'") + " 'linear' or 'log_linear'" + ) if self.use_karras_sigmas: - sigmas = self._convert_to_karras( - in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array( - [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -231,9 +221,7 @@ def _sigma_to_t(self, sigma, log_sigmas): dists = log_sigma - log_sigmas[:, np.newaxis] # get sigmas range - low_idx = (np.cumsum( - (dists >= 0), axis=0).argmax(axis=0) - .clip(max=log_sigmas.shape[0] - 2)) + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = log_sigmas[low_idx] @@ -248,8 +236,7 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t - def _convert_to_karras(self, in_sigmas: paddle.Tensor, - num_inference_steps) -> paddle.Tensor: + def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: """Constructs the noise schedule of Karras et al. 
(2022).""" sigma_min = in_sigmas[-1].item() @@ -257,24 +244,23 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor, rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min**(1 / rho) - max_inv_rho = sigma_max**(1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - s_churn: float=0.0, - s_tmin: float=0.0, - s_tmax: float=float("inf"), - s_noise: float=1.0, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[EulerDiscreteSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[EulerDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -301,35 +287,32 @@ def step( if not self.is_scale_input_called: logger.warning( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." + ) step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - gamma = (min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) - if s_tmin <= sigma <= s_tmax else 0.0) + gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) eps = noise * s_noise sigma_hat = sigma * (gamma + 1) if gamma > 0: - sample = sample + eps * (sigma_hat**2 - sigma**2)**0.5 + sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise # NOTE: "original_sample" should not be an expected prediction_type but is left in for # backwards compatibility - if (self.config.prediction_type == "original_sample" or - self.config.prediction_type == "sample"): + if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample": pred_original_sample = model_output elif self.config.prediction_type == "epsilon": pred_original_sample = sample - sigma_hat * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5 - ) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -343,22 +326,21 @@ def step( prev_sample = sample + derivative * dt if not return_dict: - return (prev_sample, ) + return (prev_sample,) - return EulerDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py index 4cd27a38164ff..05a8673a2a358 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_heun_discrete.py @@ -20,13 +20,11 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
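The squaredcos_cap_v2 branch in these constructors delegates to betas_for_alpha_bar, whose signature, docstring, and alpha_bar line appear in the hunk below. The loop body is elided there; the sketch below fills it in with the standard diffusers/ppdiffusers formulation (plain Python, returning a list instead of a tensor), where each beta is picked so that alpha_bar(t2) = (1 - beta) * alpha_bar(t1), capped at max_beta:

import math

def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
    def alpha_bar(time_step):
        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # choose beta_i so that alpha_bar(t2) / alpha_bar(t1) = 1 - beta_i, capped at max_beta
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return betas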
@@ -45,7 +43,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -90,32 +88,35 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, # sensible defaults - beta_end: float=0.012, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", - use_karras_sigmas: Optional[bool]=False, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + use_karras_sigmas: Optional[bool] = False, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -137,9 +138,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): return indices[pos].item() def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor: + self, + sample: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + ) -> paddle.Tensor: """ Args: Ensures interchangeability with schedulers that need to scale the denoising model input depending on the @@ -151,13 +153,14 @@ def scale_model_input( step_index = self.index_for_timestep(timestep) sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int]=None, ): + self, + num_inference_steps: int, + num_train_timesteps: Optional[int] = None, + ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
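When use_karras_sigmas is set, the set_timesteps bodies below replace the interpolated sigmas with the Karras et al. (2022) schedule: a linear ramp in sigma**(1/rho) space with rho = 7. A NumPy sketch of _convert_to_karras as it appears in this diff:

import numpy as np

def convert_to_karras(in_sigmas, num_inference_steps, rho=7.0):
    # in_sigmas is ordered from largest to smallest, so [0] is sigma_max and [-1] is sigma_min
    sigma_min, sigma_max = in_sigmas[-1], in_sigmas[0]
    ramp = np.linspace(0, 1, num_inference_steps)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho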
@@ -169,32 +172,25 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_inference_steps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) if self.use_karras_sigmas: - sigmas = self._convert_to_karras( - in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) - timesteps = np.array( - [self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) sigmas = paddle.to_tensor(sigmas) - self.sigmas = paddle.concat( - [sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) + self.sigmas = paddle.concat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) # standard deviation of the initial noise distribution self.init_noise_sigma = self.sigmas.max() timesteps = paddle.to_tensor(timesteps) - timesteps = paddle.concat( - [timesteps[:1], timesteps[1:].repeat_interleave(2)]) + timesteps = paddle.concat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) self.timesteps = timesteps.cast(paddle.float32) @@ -210,9 +206,7 @@ def _sigma_to_t(self, sigma, log_sigmas): dists = log_sigma - log_sigmas[:, np.newaxis] # get sigmas range - low_idx = (np.cumsum( - (dists >= 0), axis=0).argmax(axis=0) - .clip(max=log_sigmas.shape[0] - 2)) + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = log_sigmas[low_idx] @@ -227,8 +221,7 @@ def _sigma_to_t(self, sigma, log_sigmas): t = t.reshape(sigma.shape) return t - def _convert_to_karras(self, in_sigmas: paddle.Tensor, - num_inference_steps) -> paddle.Tensor: + def _convert_to_karras(self, in_sigmas: paddle.Tensor, num_inference_steps) -> paddle.Tensor: """Constructs the noise schedule of Karras et al. (2022).""" sigma_min = in_sigmas[-1].item() @@ -236,9 +229,9 @@ def _convert_to_karras(self, in_sigmas: paddle.Tensor, rho = 7.0 # 7.0 is the value used in the paper ramp = np.linspace(0, 1, num_inference_steps) - min_inv_rho = sigma_min**(1 / rho) - max_inv_rho = sigma_max**(1 / rho) - sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho))**rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho return sigmas @property @@ -246,11 +239,12 @@ def state_in_first_order(self): return self.dt is None def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: Union[paddle.Tensor, np.ndarray], + timestep: Union[float, paddle.Tensor], + sample: Union[paddle.Tensor, np.ndarray], + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Args: Predict the sample at the previous timestep by reversing the SDE. 
Core function to propagate the diffusion @@ -286,11 +280,11 @@ def step( pred_original_sample = sample - sigma_input * model_output elif self.config.prediction_type == "v_prediction": sigma_input = sigma_hat if self.state_in_first_order else sigma_next - pred_original_sample = model_output * (-sigma_input / ( - sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1)) + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -324,22 +318,21 @@ def step( prev_sample = sample + derivative * dt if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [ - self.index_for_timestep(t, schedule_timesteps) for t in timesteps - ] + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py index 4d2b87c82ae86..8b8595755cb61 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_ipndm.py @@ -43,9 +43,10 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, ): + self, + num_train_timesteps: int = 1000, + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + ): # set `betas`, `alphas`, `timesteps` self.set_timesteps(num_train_timesteps) @@ -73,24 +74,23 @@ def set_timesteps(self, num_inference_steps: int): steps = paddle.concat([steps, paddle.to_tensor([0.0])]) if self.config.trained_betas is not None: - self.betas = paddle.to_tensor( - self.config.trained_betas, dtype=paddle.float32) + self.betas = paddle.to_tensor(self.config.trained_betas, dtype=paddle.float32) else: - self.betas = paddle.sin(steps * math.pi / 2)**2 + self.betas = paddle.sin(steps * math.pi / 2) ** 2 - self.alphas = (1.0 - self.betas**2)**0.5 + self.alphas = (1.0 - self.betas**2) ** 0.5 - self.timesteps = (paddle.atan2(self.betas, self.alphas) / math.pi * - 2)[:-1] + self.timesteps = (paddle.atan2(self.betas, self.alphas) / math.pi * 2)[:-1] self.ets = [] def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple times to approximate the solution. 
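The iPNDM step below keeps a short history of noise estimates (self.ets) and combines them with classic linear multistep (Adams-Bashforth) weights before calling _get_prev_sample. The coefficients are visible in the next hunk; the single-estimate branch is elided there, so its body in this sketch is the natural fallback of using the estimate itself:

def adams_bashforth_combine(ets):
    # ets: most recent noise estimates, newest last (mirrors self.ets in the step() below)
    if len(ets) == 1:
        return ets[-1]
    if len(ets) == 2:
        return (3 * ets[-1] - ets[-2]) / 2
    if len(ets) == 3:
        return (23 * ets[-1] - 16 * ets[-2] + 5 * ets[-3]) / 12
    return (55 * ets[-1] - 59 * ets[-2] + 37 * ets[-3] - 9 * ets[-4]) / 24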
@@ -119,8 +119,7 @@ def step( timestep_index = (self.timesteps == timestep).nonzero().item() prev_timestep_index = timestep_index + 1 - ets = (sample * self.betas[timestep_index] + model_output * - self.alphas[timestep_index]) + ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index] self.ets.append(ets) if len(self.ets) == 1: @@ -128,22 +127,18 @@ def step( elif len(self.ets) == 2: ets = (3 * self.ets[-1] - self.ets[-2]) / 2 elif len(self.ets) == 3: - ets = ( - 23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 + ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 else: - ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * - self.ets[-3] - 9 * self.ets[-4]) + ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) - prev_sample = self._get_prev_sample(sample, timestep_index, - prev_timestep_index, ets) + prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets) if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -156,8 +151,7 @@ def scale_model_input(self, sample: paddle.Tensor, *args, """ return sample - def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, - ets): + def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets): alpha = self.alphas[timestep_index] sigma = self.betas[timestep_index] diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 937c161348c12..9857a57444941 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -21,13 +21,11 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import randn_tensor -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
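Every step() in this diff first converts the raw model output into a predicted x0 through the same prediction_type branch (epsilon or v_prediction) before taking the actual solver step. A compact sketch of that conversion, using the formulas that appear verbatim in the hunks above and below (the helper name is mine):

def pred_original_sample(model_output, sample, sigma, prediction_type="epsilon"):
    if prediction_type == "epsilon":
        # the model predicted the noise: x0 = x - sigma * eps
        return sample - sigma * model_output
    if prediction_type == "v_prediction":
        # v-prediction: x0 = v * c_out + x * c_skip, with c_out = -sigma / sqrt(sigma**2 + 1) and c_skip = 1 / (sigma**2 + 1)
        return model_output * (-sigma / (sigma**2 + 1) ** 0.5) + sample / (sigma**2 + 1)
    raise ValueError(f"unsupported prediction_type: {prediction_type}")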
@@ -46,7 +44,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -88,31 +86,34 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, # sensible defaults - beta_end: float=0.012, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -133,9 +134,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): return indices[pos].item() def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor: + self, + sample: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + ) -> paddle.Tensor: """ Args: Ensures interchangeability with schedulers that need to scale the denoising model input depending on the @@ -151,13 +153,14 @@ def scale_model_input( else: sigma = self.sigmas_interpol[step_index - 1] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int]=None, ): + self, + num_inference_steps: int, + num_train_timesteps: Optional[int] = None, + ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
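The ancestral schedulers (EulerAncestral above, KDPM2Ancestral here) split each move from sigma_from down to sigma_to into a deterministic part sigma_down and a fresh-noise part sigma_up with sigma_down**2 + sigma_up**2 = sigma_to**2, using exactly the expressions in the set_timesteps and step bodies. As a scalar sketch:

def ancestral_sigmas(sigma_from, sigma_to):
    sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
    return sigma_down, sigma_up

# the step then moves deterministically to sigma_down and re-injects noise scaled by sigma_up:
# prev_sample = sample + derivative * (sigma_down - sigma) + noise * sigma_up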
@@ -169,12 +172,9 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_inference_steps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) @@ -184,9 +184,8 @@ def set_timesteps( # compute up and down sigmas sigmas_next = sigmas.roll(-1) sigmas_next[-1] = 0.0 - sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas - **2)**0.5 - sigmas_down = (sigmas_next**2 - sigmas_up**2)**0.5 + sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5 + sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5 sigmas_down[-1] = 0.0 # compute interpolated sigmas @@ -194,20 +193,16 @@ def set_timesteps( sigmas_interpol[-2:] = 0.0 # set sigmas - self.sigmas = paddle.concat( - [sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) - self.sigmas_interpol = paddle.concat([ - sigmas_interpol[:1], - sigmas_interpol[1:].repeat_interleave(2), - sigmas_interpol[-1:], - ]) - self.sigmas_up = paddle.concat([ - sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:] - ]) - self.sigmas_down = paddle.concat([ - sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), - sigmas_down[-1:] - ]) + self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) + self.sigmas_interpol = paddle.concat( + [ + sigmas_interpol[:1], + sigmas_interpol[1:].repeat_interleave(2), + sigmas_interpol[-1:], + ] + ) + self.sigmas_up = paddle.concat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]]) + self.sigmas_down = paddle.concat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]]) # standard deviation of the initial noise distribution self.init_noise_sigma = self.sigmas.max() @@ -215,12 +210,9 @@ def set_timesteps( timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) timesteps_interpol = self.sigma_to_t(sigmas_interpol) - timesteps_interpol = paddle.cast( - timesteps_interpol, dtype=timesteps.dtype) + timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype) - interleaved_timesteps = paddle.stack( - (timesteps_interpol[:-2, None], timesteps[1:, None]), - axis=-1).flatten() + interleaved_timesteps = paddle.stack((timesteps_interpol[:-2, None], timesteps[1:, None]), axis=-1).flatten() self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps]) @@ -234,8 +226,7 @@ def sigma_to_t(self, sigma): dists = log_sigma - self.log_sigmas[:, None] # get sigmas range - low_idx = ((dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0) - .clip(max=self.log_sigmas.shape[0] - 2)) + low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = self.log_sigmas[low_idx] @@ -255,13 +246,13 @@ def state_in_first_order(self): return self.sample is None def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: 
Union[paddle.Tensor, np.ndarray], + timestep: Union[float, paddle.Tensor], + sample: Union[paddle.Tensor, np.ndarray], + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Args: Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion @@ -295,8 +286,7 @@ def step( gamma = 0 sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now - noise = randn_tensor( - model_output.shape, dtype=model_output.dtype, generator=generator) + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, generator=generator) # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise if self.config.prediction_type == "epsilon": @@ -304,11 +294,11 @@ def step( pred_original_sample = sample - sigma_input * model_output elif self.config.prediction_type == "v_prediction": sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = model_output * (-sigma_input / ( - sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1)) + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -338,22 +328,21 @@ def step( prev_sample = prev_sample + noise * sigma_up if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [ - self.index_for_timestep(t, schedule_timesteps) for t in timesteps - ] + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py index b6df7c60c3000..87790b6ece926 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -20,13 +20,11 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, - max_beta=0.999) -> paddle.Tensor: +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> paddle.Tensor: """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
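The sigma_to_t helpers further down recover a (fractional) training timestep from a sigma by interpolating in log-sigma space; the diff shows the index search, while the interpolation weight sits in elided context and follows the usual k-diffusion form. A NumPy sketch of the whole helper under that assumption:

import numpy as np

def sigma_to_t(sigma, log_sigmas):
    sigma = np.asarray(sigma, dtype=np.float64)
    log_sigma = np.log(sigma)
    dists = log_sigma - log_sigmas[:, np.newaxis]
    # low_idx: last entry with log_sigmas[low_idx] <= log_sigma (log_sigmas ascends over training timesteps),
    # clipped so that a neighbouring entry above always exists
    low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
    high_idx = low_idx + 1
    low, high = log_sigmas[low_idx], log_sigmas[high_idx]
    # linear interpolation weight between the two neighbouring log-sigmas
    w = np.clip((low - log_sigma) / (low - high), 0, 1)
    t = (1 - w) * low_idx + w * high_idx
    return t.reshape(sigma.shape)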
@@ -45,7 +43,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -87,31 +85,34 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.00085, # sensible defaults - beta_end: float=0.012, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -132,9 +133,10 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): return indices[pos].item() def scale_model_input( - self, - sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor], ) -> paddle.Tensor: + self, + sample: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + ) -> paddle.Tensor: """ Args: Ensures interchangeability with schedulers that need to scale the denoising model input depending on the @@ -150,13 +152,14 @@ def scale_model_input( else: sigma = self.sigmas_interpol[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) return sample def set_timesteps( - self, - num_inference_steps: int, - num_train_timesteps: Optional[int]=None, ): + self, + num_inference_steps: int, + num_train_timesteps: Optional[int] = None, + ): """ Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. 
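KDPM2 is a two-stage (Heun-like) method, so set_timesteps duplicates every inner sigma and interleaves the interpolated mid-step sigmas, giving the sampler two model evaluations per output step. The repeat_interleave pattern from the hunk below, in NumPy form:

import numpy as np

def interleave_for_two_stage(sigmas):
    # [s0, s1, s1, s2, s2, ..., s_last, s_last, s_last] -
    # mirrors concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]])
    return np.concatenate([sigmas[:1], np.repeat(sigmas[1:], 2), sigmas[-1:]])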
@@ -168,12 +171,9 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_inference_steps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) self.log_sigmas = paddle.to_tensor(np.log(sigmas), dtype=paddle.float32) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) @@ -185,13 +185,14 @@ def set_timesteps( # must set to 0.0 sigmas_interpol[-1] = 0.0 - self.sigmas = paddle.concat( - [sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) - self.sigmas_interpol = paddle.concat([ - sigmas_interpol[:1], - sigmas_interpol[1:].repeat_interleave(2), - sigmas_interpol[-1:], - ]) + self.sigmas = paddle.concat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) + self.sigmas_interpol = paddle.concat( + [ + sigmas_interpol[:1], + sigmas_interpol[1:].repeat_interleave(2), + sigmas_interpol[-1:], + ] + ) # standard deviation of the initial noise distribution self.init_noise_sigma = self.sigmas.max() @@ -199,11 +200,8 @@ def set_timesteps( timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) # interpolate timesteps timesteps_interpol = self.sigma_to_t(sigmas_interpol) - timesteps_interpol = paddle.cast( - timesteps_interpol, dtype=timesteps.dtype) - interleaved_timesteps = paddle.stack( - (timesteps_interpol[1:-1, None], timesteps[1:, None]), - axis=-1).flatten() + timesteps_interpol = paddle.cast(timesteps_interpol, dtype=timesteps.dtype) + interleaved_timesteps = paddle.stack((timesteps_interpol[1:-1, None], timesteps[1:, None]), axis=-1).flatten() self.timesteps = paddle.concat([timesteps[:1], interleaved_timesteps]) @@ -217,8 +215,7 @@ def sigma_to_t(self, sigma): dists = log_sigma - self.log_sigmas[:, None] # get sigmas range - low_idx = ((dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0) - .clip(max=self.log_sigmas.shape[0] - 2)) + low_idx = (dists >= 0).cast("int64").cumsum(axis=0).argmax(axis=0).clip(max=self.log_sigmas.shape[0] - 2) high_idx = low_idx + 1 low = self.log_sigmas[low_idx] @@ -238,11 +235,12 @@ def state_in_first_order(self): return self.sample is None def step( - self, - model_output: Union[paddle.Tensor, np.ndarray], - timestep: Union[float, paddle.Tensor], - sample: Union[paddle.Tensor, np.ndarray], - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: Union[paddle.Tensor, np.ndarray], + timestep: Union[float, paddle.Tensor], + sample: Union[paddle.Tensor, np.ndarray], + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Args: Predict the sample at the previous timestep by reversing the SDE. 
Core function to propagate the diffusion @@ -280,11 +278,11 @@ def step( pred_original_sample = sample - sigma_input * model_output elif self.config.prediction_type == "v_prediction": sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol - pred_original_sample = model_output * (-sigma_input / ( - sigma_input**2 + 1)**0.5) + (sample / (sigma_input**2 + 1)) + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) elif self.config.prediction_type == "sample": - raise NotImplementedError( - "prediction_type not implemented yet: sample") + raise NotImplementedError("prediction_type not implemented yet: sample") else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" @@ -312,22 +310,21 @@ def step( prev_sample = sample + derivative * dt if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [ - self.index_for_timestep(t, schedule_timesteps) for t in timesteps - ] + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py index ba4bf176efd6c..f104b1a69a8d9 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_karras_ve.py @@ -81,13 +81,14 @@ class KarrasVeScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - sigma_min: float=0.02, - sigma_max: float=100, - s_noise: float=1.007, - s_churn: float=80, - s_min: float=0.05, - s_max: float=50, ): + self, + sigma_min: float = 0.02, + sigma_max: float = 100, + s_noise: float = 1.007, + s_churn: float = 80, + s_min: float = 0.05, + s_max: float = 50, + ): # standard deviation of the initial noise distribution self.init_noise_sigma = sigma_max @@ -96,9 +97,7 @@ def __init__( self.timesteps: paddle.Tensor = None self.schedule: paddle.Tensor = None # sigma(t_i) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
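KarrasVeScheduler's add_noise_to_input below implements the "churn" trick described in its docstring: temporarily raise the noise level to sigma_hat = sigma + gamma * sigma and add just enough Gaussian noise to get there. A NumPy sketch of the same step (the sqrt(2) - 1 cap on gamma and the s_min/s_max gate are taken straight from the hunk):

import numpy as np

def add_noise_to_input(sample, sigma, s_churn, s_noise, s_min, s_max, num_inference_steps, rng=None):
    if rng is None:
        rng = np.random.default_rng()
    gamma = min(s_churn / num_inference_steps, 2**0.5 - 1) if s_min <= sigma <= s_max else 0.0
    eps = s_noise * rng.standard_normal(sample.shape)
    sigma_hat = sigma + gamma * sigma
    # add exactly the variance needed to move from sigma**2 up to sigma_hat**2
    sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5) * eps
    return sample_hat, sigma_hat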
@@ -124,17 +123,21 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = num_inference_steps timesteps = np.arange(0, self.num_inference_steps)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) - schedule = [(self.config.sigma_max - **2 * (self.config.sigma_min**2 / self.config.sigma_max**2) - **(i / (num_inference_steps - 1))) for i in self.timesteps] + schedule = [ + ( + self.config.sigma_max**2 + * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1)) + ) + for i in self.timesteps + ] self.schedule = paddle.to_tensor(schedule, dtype=paddle.float32) def add_noise_to_input( - self, - sample: paddle.Tensor, - sigma: float, - generator: Optional[paddle.Generator]=None, ) -> Tuple[ - paddle.Tensor, float]: + self, + sample: paddle.Tensor, + sigma: float, + generator: Optional[paddle.Generator] = None, + ) -> Tuple[paddle.Tensor, float]: """ Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a higher noise level sigma_hat = sigma_i + gamma_i*sigma_i. @@ -142,26 +145,25 @@ def add_noise_to_input( TODO Args: """ if self.config.s_min <= sigma <= self.config.s_max: - gamma = min(self.config.s_churn / self.num_inference_steps, - 2**0.5 - 1) + gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1) else: gamma = 0 # sample eps ~ N(0, S_noise^2 * I) - eps = self.config.s_noise * randn_tensor( - sample.shape, generator=generator) + eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator) sigma_hat = sigma + gamma * sigma - sample_hat = sample + ((sigma_hat**2 - sigma**2)**0.5 * eps) + sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps) return sample_hat, sigma_hat def step( - self, - model_output: paddle.Tensor, - sigma_hat: float, - sigma_prev: float, - sample_hat: paddle.Tensor, - return_dict: bool=True, ) -> Union[KarrasVeOutput, Tuple]: + self, + model_output: paddle.Tensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: paddle.Tensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -191,17 +193,19 @@ def step( return KarrasVeOutput( prev_sample=sample_prev, derivative=derivative, - pred_original_sample=pred_original_sample, ) + pred_original_sample=pred_original_sample, + ) def step_correct( - self, - model_output: paddle.Tensor, - sigma_hat: float, - sigma_prev: float, - sample_hat: paddle.Tensor, - sample_prev: paddle.Tensor, - derivative: paddle.Tensor, - return_dict: bool=True, ) -> Union[KarrasVeOutput, Tuple]: + self, + model_output: paddle.Tensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: paddle.Tensor, + sample_prev: paddle.Tensor, + derivative: paddle.Tensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: """ Correct the predicted sample based on the output model_output of the network. 
TODO complete description @@ -220,8 +224,7 @@ def step_correct( """ pred_original_sample = sample_prev + sigma_prev * model_output derivative_corr = (sample_prev - pred_original_sample) / sigma_prev - sample_prev = sample_hat + (sigma_prev - sigma_hat) * ( - 0.5 * derivative + 0.5 * derivative_corr) + sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr) if not return_dict: return (sample_prev, derivative) @@ -229,7 +232,8 @@ def step_correct( return KarrasVeOutput( prev_sample=sample_prev, derivative=derivative, - pred_original_sample=pred_original_sample, ) + pred_original_sample=pred_original_sample, + ) def add_noise(self, original_samples, noise, timesteps): raise NotImplementedError() diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py index 872f3891e0cf4..122b5e8dffa7d 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_lms_discrete.py @@ -65,7 +65,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -106,37 +106,39 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - prediction_type: str="epsilon", ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. 
- self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -145,16 +147,12 @@ def __init__( # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=float)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps, dtype=paddle.float32) self.derivatives = [] self.is_scale_input_called = False - def scale_model_input( - self, sample: paddle.Tensor, - timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Union[float, paddle.Tensor]) -> paddle.Tensor: """ Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. @@ -167,7 +165,7 @@ def scale_model_input( """ step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] - sample = sample / ((sigma**2 + 1)**0.5) + sample = sample / ((sigma**2 + 1) ** 0.5) self.is_scale_input_called = True return sample @@ -186,12 +184,10 @@ def lms_derivative(tau): for k in range(order): if current_order == k: continue - prod *= (tau - self.sigmas[t - k]) / ( - self.sigmas[t - current_order] - self.sigmas[t - k]) + prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) return prod - integrated_coeff = integrate.quad( - lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] return integrated_coeff @@ -205,13 +201,8 @@ def set_timesteps(self, num_inference_steps: int): """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace( - 0, - self.config.num_train_timesteps - 1, - num_inference_steps, - dtype=float)[::-1].copy() - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod)** - 0.5) + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) self.sigmas = paddle.to_tensor(sigmas) @@ -220,13 +211,13 @@ def set_timesteps(self, num_inference_steps: int): self.derivatives = [] def step( - self, - model_output: paddle.Tensor, - timestep: Union[float, paddle.Tensor], - sample: paddle.Tensor, - order: int=4, - return_dict: bool=True, ) -> Union[LMSDiscreteSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: Union[float, paddle.Tensor], + sample: paddle.Tensor, + 
order: int = 4, + return_dict: bool = True, + ) -> Union[LMSDiscreteSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -248,7 +239,8 @@ def step( if not self.is_scale_input_called: warnings.warn( "The `scale_model_input` function should be called before `step` to ensure correct denoising. " - "See `StableDiffusionPipeline` for a usage example.") + "See `StableDiffusionPipeline` for a usage example." + ) step_index = (self.timesteps == timestep).nonzero().item() sigma = self.sigmas[step_index] @@ -258,8 +250,7 @@ def step( pred_original_sample = sample - sigma * model_output elif self.config.prediction_type == "v_prediction": # * c_out + input * c_skip - pred_original_sample = model_output * (-sigma / (sigma**2 + 1)**0.5 - ) + (sample / (sigma**2 + 1)) + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) elif self.config.prediction_type == "sample": pred_original_sample = model_output else: @@ -275,33 +266,29 @@ def step( # 3. Compute linear multistep coefficients order = min(step_index + 1, order) - lms_coeffs = [ - self.get_lms_coefficient(order, step_index, curr_order) - for curr_order in range(order) - ] + lms_coeffs = [self.get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)] # 4. Compute previous sample based on the derivatives path - prev_sample = sample + sum(coeff * derivative - for coeff, derivative in zip( - lms_coeffs, reversed(self.derivatives))) + prev_sample = sample + sum( + coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) + ) if not return_dict: - return (prev_sample, ) + return (prev_sample,) - return LMSDiscreteSchedulerOutput( - prev_sample=prev_sample, pred_original_sample=pred_original_sample) + return LMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.sigmas.cast(original_samples.dtype) schedule_timesteps = self.timesteps - step_indices = [(schedule_timesteps == t).nonzero().item() - for t in timesteps] + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] sigma = sigmas[step_indices].flatten() while len(sigma.shape) < len(original_samples.shape): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py index 437f108e73af3..c821dae87d35d 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_pndm.py @@ -22,8 +22,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput # Copied from ppdiffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * 
math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -99,40 +98,42 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - skip_prk_steps: bool=False, - set_alpha_to_one: bool=False, - prediction_type: str="epsilon", - steps_offset: int=0, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + skip_prk_steps: bool = False, + set_alpha_to_one: bool = False, + prediction_type: str = "epsilon", + steps_offset: int = 0, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) - self.final_alpha_cumprod = (paddle.to_tensor(1.0) if set_alpha_to_one - else self.alphas_cumprod[0]) + self.final_alpha_cumprod = paddle.to_tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 @@ -168,8 +169,7 @@ def set_timesteps(self, num_inference_steps: int): step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 - self._timesteps = (np.arange(0, num_inference_steps) * - step_ratio).round() + self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() self._timesteps += self.config.steps_offset if self.config.skip_prk_steps: @@ -177,25 +177,20 @@ def set_timesteps(self, num_inference_steps: int): # produce better results. 
When using PNDM with `self.config.skip_prk_steps` the implementation # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51 self.prk_timesteps = np.array([]) - self.plms_timesteps = np.concatenate([ - self._timesteps[:-1], self._timesteps[-2:-1], - self._timesteps[-1:] - ])[::-1].copy() + self.plms_timesteps = np.concatenate([self._timesteps[:-1], self._timesteps[-2:-1], self._timesteps[-1:]])[ + ::-1 + ].copy() else: - prk_timesteps = np.array(self._timesteps[-self.pndm_order:]).repeat( - 2) + np.tile( - np.array([ - 0, self.config.num_train_timesteps // - num_inference_steps // 2 - ]), - self.pndm_order, ) - self.prk_timesteps = ( - prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy() - self.plms_timesteps = self._timesteps[:-3][::-1].copy( - ) # we copy to avoid having negative strides which are not supported by paddle - - timesteps = np.concatenate( - [self.prk_timesteps, self.plms_timesteps]).astype(np.int64) + prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile( + np.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), + self.pndm_order, + ) + self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy() + self.plms_timesteps = self._timesteps[:-3][ + ::-1 + ].copy() # we copy to avoid having negative strides which are not supported by paddle + + timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) self.ets = [] @@ -203,11 +198,12 @@ def set_timesteps(self, num_inference_steps: int): self.cur_model_output = 0 def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -227,26 +223,28 @@ def step( returning a tuple, the first element is the sample tensor. """ - if self.counter < len( - self.prk_timesteps) and not self.config.skip_prk_steps: + if self.counter < len(self.prk_timesteps) and not self.config.skip_prk_steps: return self.step_prk( model_output=model_output, timestep=timestep, sample=sample, - return_dict=return_dict, ) + return_dict=return_dict, + ) else: return self.step_plms( model_output=model_output, timestep=timestep, sample=sample, - return_dict=return_dict, ) + return_dict=return_dict, + ) def step_prk( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the solution to the differential equation. 
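To make the PLMS timestep bookkeeping above concrete, here is a rough NumPy reconstruction of the `skip_prk_steps` branch (constants chosen arbitrarily; the duplicated second-to-last step mirrors the concatenation in `set_timesteps`):

```python
import numpy as np

num_train_timesteps, num_inference_steps, steps_offset = 1000, 10, 1
step_ratio = num_train_timesteps // num_inference_steps
_timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() + steps_offset
plms_timesteps = np.concatenate(
    [_timesteps[:-1], _timesteps[-2:-1], _timesteps[-1:]]
)[::-1].astype(np.int64)
print(plms_timesteps)
# -> [901 801 801 701 601 501 401 301 201 101 1]
```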
@@ -268,9 +266,7 @@ def step_prk( "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" ) - diff_to_prev = (0 - if self.counter % 2 else self.config.num_train_timesteps - // self.num_inference_steps // 2) + diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2 prev_timestep = timestep - diff_to_prev timestep = self.prk_timesteps[self.counter // 4 * 4] @@ -289,21 +285,21 @@ def step_prk( # cur_sample should not be `None` cur_sample = self.cur_sample if self.cur_sample is not None else sample - prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, - model_output) + prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output) self.counter += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def step_plms( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple times to approximate the solution. @@ -330,18 +326,17 @@ def step_plms( f"{self.__class__} can only be run AFTER scheduler has been run " "in 'prk' mode for at least 12 iterations " "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py " - "for more information.") + "for more information." + ) - prev_timestep = (timestep - self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps if self.counter != 1: self.ets = self.ets[-3:] self.ets.append(model_output) else: prev_timestep = timestep - timestep = (timestep + self.config.num_train_timesteps // - self.num_inference_steps) + timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps if len(self.ets) == 1 and self.counter == 0: model_output = model_output @@ -353,23 +348,19 @@ def step_plms( elif len(self.ets) == 2: model_output = (3 * self.ets[-1] - self.ets[-2]) / 2 elif len(self.ets) == 3: - model_output = ( - 23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 + model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 else: - model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + - 37 * self.ets[-3] - 9 * self.ets[-4]) + model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) - prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, - model_output) + prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output) self.counter += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
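The `step_plms` hunk above combines the history of noise predictions with Adams-Bashforth style weights once enough `ets` entries exist. A simplified sketch of that blend, ignoring the special-cased first two counter steps (helper name is hypothetical):

```python
def blend_ets(ets):
    # linear multistep combination of the most recent noise predictions
    if len(ets) == 1:
        return ets[-1]
    if len(ets) == 2:
        return (3 * ets[-1] - ets[-2]) / 2
    if len(ets) == 3:
        return (23 * ets[-1] - 16 * ets[-2] + 5 * ets[-3]) / 12
    return (55 * ets[-1] - 59 * ets[-2] + 37 * ets[-3] - 9 * ets[-4]) / 24

print(blend_ets([1.0, 2.0, 3.0, 4.0]))  # 4th-order estimate from scalar stand-ins
```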
@@ -396,14 +387,12 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): # model_output -> e_θ(x_t, t) # prev_sample -> x_(t−δ) alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev if self.config.prediction_type == "v_prediction": - model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t** - 0.5) * sample + model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample elif self.config.prediction_type != "epsilon": raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`" @@ -413,41 +402,41 @@ def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): # denominator of x_t in formula (9) and plus 1 # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) = # sqrt(α_(t−δ)) / sqrt(α_t)) - sample_coeff = (alpha_prod_t_prev / alpha_prod_t)**(0.5) + sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) # corresponds to denominator of e_θ(x_t, t) in formula (9) - model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev**(0.5) + ( - alpha_prod_t * beta_prod_t * alpha_prod_t_prev)**(0.5) + model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + ( + alpha_prod_t * beta_prod_t * alpha_prod_t_prev + ) ** (0.5) # full formula (9) - prev_sample = (sample_coeff * sample - - (alpha_prod_t_prev - alpha_prod_t - ) * model_output / model_output_denom_coeff) + prev_sample = ( + sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff + ) return prev_sample # Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py index 71460c026a92b..d040c40ba5124 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_repaint.py @@ -64,7 +64,7 @@ def 
betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -109,26 +109,30 @@ class RePaintScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - eta: float=0.0, - trained_betas: Optional[np.ndarray]=None, - clip_sample: bool=True, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + eta: float = 0.0, + trained_betas: Optional[np.ndarray] = None, + clip_sample: bool = True, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) @@ -137,8 +141,7 @@ def __init__( betas = paddle.linspace(-6, 6, num_train_timesteps) self.betas = F.sigmoid(betas) * (beta_end - beta_start) + beta_start else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -151,14 +154,11 @@ def __init__( # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) self.eta = eta - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
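Several of the files touched here reformat the same `betas_for_alpha_bar` helper. As a standalone reference, this is the cosine construction it implements; the loop body follows the standard definition and the return type is simplified to a plain list:

```python
import math

def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
    def alpha_bar(time_step):
        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        # cap each beta so the schedule stays numerically stable near t = 1
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return betas

print(betas_for_alpha_bar(5))  # small, monotonically increasing values
```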
@@ -173,12 +173,12 @@ def scale_model_input(self, return sample def set_timesteps( - self, - num_inference_steps: int, - jump_length: int=10, - jump_n_sample: int=10, ): - num_inference_steps = min(self.config.num_train_timesteps, - num_inference_steps) + self, + num_inference_steps: int, + jump_length: int = 10, + jump_n_sample: int = 10, + ): + num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps) self.num_inference_steps = num_inference_steps timesteps = [] @@ -198,16 +198,14 @@ def set_timesteps( t = t + 1 timesteps.append(t) - timesteps = np.array(timesteps) * (self.config.num_train_timesteps // - self.num_inference_steps) + timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps) self.timesteps = paddle.to_tensor(timesteps) def _get_variance(self, t): prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev @@ -218,21 +216,20 @@ def _get_variance(self, t): # Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf # without eta. # variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t] - variance = (beta_prod_t_prev / beta_prod_t) * ( - 1 - alpha_prod_t / alpha_prod_t_prev) + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) return variance def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - original_image: paddle.Tensor, - mask: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[RePaintSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + original_image: paddle.Tensor, + mask: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[RePaintSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -258,19 +255,16 @@ def step( """ t = timestep - prev_timestep = (timestep - self.config.num_train_timesteps // - self.num_inference_steps) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps # 1. compute alphas, betas alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] if - prev_timestep >= 0 else self.final_alpha_cumprod) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod beta_prod_t = 1 - alpha_prod_t # 2. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample = ( - sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5 + pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5 # 3. Clip "predicted x_0" if self.config.clip_sample: @@ -284,9 +278,8 @@ def step( # been observed. # 5. 
Add noise - noise = randn_tensor( - model_output.shape, generator=generator, dtype=model_output.dtype) - std_dev_t = self.eta * self._get_variance(timestep)**0.5 + noise = randn_tensor(model_output.shape, generator=generator, dtype=model_output.dtype) + std_dev_t = self.eta * self._get_variance(timestep) ** 0.5 variance = 0 if t > 0 and self.eta > 0: @@ -294,51 +287,44 @@ def step( # 6. compute "direction pointing to x_t" of formula (12) # from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = ( - 1 - alpha_prod_t_prev - std_dev_t**2)**0.5 * model_output + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output # 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_unknown_part = (alpha_prod_t_prev**0.5 * pred_original_sample + - pred_sample_direction + variance) + prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance # 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf - prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ( - (1 - alpha_prod_t_prev)**0.5) * noise + prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ((1 - alpha_prod_t_prev) ** 0.5) * noise # 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf - pred_prev_sample = mask * prev_known_part + (1.0 - mask - ) * prev_unknown_part + pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part if not return_dict: return ( pred_prev_sample, - pred_original_sample, ) + pred_original_sample, + ) - return RePaintSchedulerOutput( - prev_sample=pred_prev_sample, - pred_original_sample=pred_original_sample) + return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) def undo_step(self, sample, timestep, generator=None): n = self.config.num_train_timesteps // self.num_inference_steps for i in range(n): beta = self.betas[timestep + i] - noise = randn_tensor( - sample.shape, generator=generator, dtype=sample.dtype) + noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype) # 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf - sample = (1 - beta)**0.5 * sample + beta**0.5 * noise + sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise return sample def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: - raise NotImplementedError( - "Use `DDPMScheduler.add_noise()` to train for sampling with RePaint." 
- ) + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: + raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.") def __len__(self): return self.config.num_train_timesteps diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py index 3513d6691d0e5..83644fdecc48a 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_ve.py @@ -71,13 +71,14 @@ class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=2000, - snr: float=0.15, - sigma_min: float=0.01, - sigma_max: float=1348.0, - sampling_eps: float=1e-5, - correct_steps: int=1, ): + self, + num_train_timesteps: int = 2000, + snr: float = 0.15, + sigma_min: float = 0.01, + sigma_max: float = 1348.0, + sampling_eps: float = 1e-5, + correct_steps: int = 1, + ): # standard deviation of the initial noise distribution self.init_noise_sigma = sigma_max @@ -86,9 +87,7 @@ def __init__( self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -102,7 +101,7 @@ def scale_model_input(self, """ return sample - def set_timesteps(self, num_inference_steps: int, sampling_eps: float=None): + def set_timesteps(self, num_inference_steps: int, sampling_eps: float = None): """ Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference. @@ -113,17 +112,17 @@ def set_timesteps(self, num_inference_steps: int, sampling_eps: float=None): final timestep value (overrides value given at Scheduler instantiation). """ - sampling_eps = (sampling_eps if sampling_eps is not None else - self.config.sampling_eps) + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps self.timesteps = paddle.linspace(1, sampling_eps, num_inference_steps) def set_sigmas( - self, - num_inference_steps: int, - sigma_min: float=None, - sigma_max: float=None, - sampling_eps: float=None, ): + self, + num_inference_steps: int, + sigma_min: float = None, + sigma_max: float = None, + sampling_eps: float = None, + ): """ Sets the noise scales used for the diffusion chain. Supporting function to be run before inference. 
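For the ScoreSdeVeScheduler noise scales described here, the continuous sigma schedule is geometric in t while `discrete_sigmas` is a log-uniform grid. A NumPy sketch with the default config values (variable names assumed):

```python
import numpy as np

sigma_min, sigma_max, sampling_eps, n = 0.01, 1348.0, 1e-5, 6
timesteps = np.linspace(1.0, sampling_eps, n)

# geometric interpolation: sigma(1) = sigma_max, sigma(~0) ~= sigma_min
sigmas = sigma_min * (sigma_max / sigma_min) ** timesteps
# log-uniform grid used for the discrete sigma lookup
discrete_sigmas = np.exp(np.linspace(np.log(sigma_min), np.log(sigma_max), n))

print(sigmas.round(2))           # decreasing from 1348.0 toward 0.01
print(discrete_sigmas.round(4))  # increasing from 0.01 to 1348.0
```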
@@ -142,33 +141,31 @@ def set_sigmas( """ sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max - sampling_eps = (sampling_eps if sampling_eps is not None else - self.config.sampling_eps) + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps if self.timesteps is None: self.set_timesteps(num_inference_steps, sampling_eps) - self.sigmas = sigma_min * (sigma_max / sigma_min)**(self.timesteps / - sampling_eps) + self.sigmas = sigma_min * (sigma_max / sigma_min) ** (self.timesteps / sampling_eps) self.discrete_sigmas = paddle.exp( - paddle.linspace( - math.log(sigma_min), math.log(sigma_max), num_inference_steps)) - self.sigmas = paddle.to_tensor( - [sigma_min * (sigma_max / sigma_min)**t for t in self.timesteps]) + paddle.linspace(math.log(sigma_min), math.log(sigma_max), num_inference_steps) + ) + self.sigmas = paddle.to_tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps]) def get_adjacent_sigma(self, timesteps, t): return paddle.where( timesteps == 0, paddle.zeros_like(t), - self.discrete_sigmas[timesteps - 1], ) + self.discrete_sigmas[timesteps - 1], + ) def step_pred( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[SdeVeOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[SdeVeOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -191,15 +188,13 @@ def step_pred( "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" ) - timestep = timestep * paddle.ones( - (sample.shape[0], - )) # paddle.repeat_interleave(timestep, sample.shape[0]) + timestep = timestep * paddle.ones((sample.shape[0],)) # paddle.repeat_interleave(timestep, sample.shape[0]) timesteps = (timestep * (len(self.timesteps) - 1)).cast("int64") sigma = self.discrete_sigmas[timesteps] adjacent_sigma = self.get_adjacent_sigma(timesteps, timestep) drift = paddle.zeros_like(sample) - diffusion = (sigma**2 - adjacent_sigma**2)**0.5 + diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5 # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x) # also equation 47 shows the analog from SDE models to ancestral sampling methods @@ -209,28 +204,23 @@ def step_pred( drift = drift - diffusion**2 * model_output # equation 6: sample noise for the diffusion term of - noise = randn_tensor( - sample.shape, generator=generator, dtype=sample.dtype) - prev_sample_mean = ( - sample - drift - ) # subtract because `dt` is a small negative timestep + noise = randn_tensor(sample.shape, generator=generator, dtype=sample.dtype) + prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep # TODO is the variable diffusion the correct scaling term for the noise? 
- prev_sample = (prev_sample_mean + diffusion * noise - ) # add impact of diffusion field g + prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g if not return_dict: return (prev_sample, prev_sample_mean) - return SdeVeOutput( - prev_sample=prev_sample, prev_sample_mean=prev_sample_mean) + return SdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean) def step_correct( - self, - model_output: paddle.Tensor, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Correct the predicted sample based on the output model_output of the network. This is often run repeatedly after making the prediction for the previous timestep. @@ -257,12 +247,10 @@ def step_correct( noise = randn_tensor(sample.shape, generator=generator) # compute step size from the model_output, the noise, and the snr - grad_norm = paddle.norm( - model_output.reshape([model_output.shape[0], -1]), axis=-1).mean() - noise_norm = paddle.norm( - noise.reshape([noise.shape[0], -1]), axis=-1).mean() - step_size = (self.config.snr * noise_norm / grad_norm)**2 * 2 - step_size = step_size * paddle.ones((sample.shape[0], )) + grad_norm = paddle.norm(model_output.reshape([model_output.shape[0], -1]), axis=-1).mean() + noise_norm = paddle.norm(noise.reshape([noise.shape[0], -1]), axis=-1).mean() + step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2 + step_size = step_size * paddle.ones((sample.shape[0],)) # self.repeat_scalar(step_size, sample.shape[0]) # compute corrected sample: model_output term and noise term @@ -270,23 +258,22 @@ def step_correct( while len(step_size.shape) < len(sample.shape): step_size = step_size.unsqueeze(-1) prev_sample_mean = sample + step_size * model_output - prev_sample = prev_sample_mean + ((step_size * 2)**0.5) * noise + prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure sigmas and timesteps have the same dtype as original_samples sigmas = self.discrete_sigmas[timesteps] - noise = (paddle.randn( - original_samples.shape, - dtype=original_samples.dtype) * sigmas[:, None, None, None]) + noise = paddle.randn(original_samples.shape, dtype=original_samples.dtype) * sigmas[:, None, None, None] noisy_samples = noise + original_samples return noisy_samples diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py index 23b4303cbf257..c0e1eebc3eb96 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_sde_vp.py @@ -42,18 +42,13 @@ class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin): order = 1 @register_to_config - def __init__(self, - num_train_timesteps=2000, - beta_min=0.1, - beta_max=20, - sampling_eps=1e-3): + def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3): self.sigmas = 
None self.discrete_sigmas = None self.timesteps = None def set_timesteps(self, num_inference_steps): - self.timesteps = paddle.linspace(1, self.config.sampling_eps, - num_inference_steps) + self.timesteps = paddle.linspace(1, self.config.sampling_eps, num_inference_steps) def step_pred(self, score, x, t, generator=None): if self.timesteps is None: @@ -63,9 +58,9 @@ def step_pred(self, score, x, t, generator=None): # TODO(Patrick) better comments + non-Paddle # postprocess model score - log_mean_coeff = (-0.25 * t**2 * - (self.config.beta_max - self.config.beta_min - ) - 0.5 * t * self.config.beta_min) + log_mean_coeff = ( + -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min + ) std = paddle.sqrt(1.0 - paddle.exp(2.0 * log_mean_coeff)) std = std.flatten() while len(std.shape) < len(score.shape): @@ -75,8 +70,7 @@ def step_pred(self, score, x, t, generator=None): # compute dt = -1.0 / len(self.timesteps) - beta_t = self.config.beta_min + t * (self.config.beta_max - - self.config.beta_min) + beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min) beta_t = beta_t.flatten() while len(beta_t.shape) < len(x.shape): beta_t = beta_t.unsqueeze(-1) diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py index 8b809e90c7159..491409f76a5e6 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_unclip.py @@ -64,7 +64,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -102,17 +102,16 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - variance_type: str="fixed_small_log", - clip_sample: bool=True, - clip_sample_range: Optional[float]=1.0, - prediction_type: str="epsilon", - beta_schedule: str="squaredcos_cap_v2", ): + self, + num_train_timesteps: int = 1000, + variance_type: str = "fixed_small_log", + clip_sample: bool = True, + clip_sample_range: Optional[float] = 1.0, + prediction_type: str = "epsilon", + beta_schedule: str = "squaredcos_cap_v2", + ): if beta_schedule != "squaredcos_cap_v2": - raise ValueError( - "UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'" - ) + raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'") self.betas = betas_for_alpha_bar(num_train_timesteps) @@ -125,14 +124,11 @@ def __init__( # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) self.variance_type = variance_type - def scale_model_input(self, - sample: paddle.Tensor, - timestep: Optional[int]=None) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, timestep: Optional[int] = None) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. @@ -159,23 +155,16 @@ def set_timesteps(self, num_inference_steps: int): the number of diffusion steps used when generating samples with a pre-trained model. 
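The `step_pred` change in scheduling_sde_vp above rewrites the marginal statistics of the VP SDE; for reference, the mean coefficient and standard deviation it computes look like this in plain NumPy (helper name is illustrative):

```python
import numpy as np

def vp_marginal(t, beta_min=0.1, beta_max=20.0):
    # log of the mean coefficient of x_t | x_0 for a linear beta(t) schedule
    log_mean_coeff = -0.25 * t**2 * (beta_max - beta_min) - 0.5 * t * beta_min
    std = np.sqrt(1.0 - np.exp(2.0 * log_mean_coeff))
    return np.exp(log_mean_coeff), std

mean_coeff, std = vp_marginal(np.array([0.1, 0.5, 1.0]))
print(mean_coeff.round(3), std.round(3))  # mean shrinks toward 0, std grows toward 1
```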
""" self.num_inference_steps = num_inference_steps - step_ratio = (self.config.num_train_timesteps - 1) / ( - self.num_inference_steps - 1) - timesteps = ((np.arange(0, num_inference_steps) * step_ratio) - .round()[::-1].copy().astype(np.int64)) + step_ratio = (self.config.num_train_timesteps - 1) / (self.num_inference_steps - 1) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.timesteps = paddle.to_tensor(timesteps) - def _get_variance(self, - t, - prev_timestep=None, - predicted_variance=None, - variance_type=None): + def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): if prev_timestep is None: prev_timestep = t - 1 alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] - if prev_timestep >= 0 else self.one) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev @@ -207,13 +196,14 @@ def _get_variance(self, return variance def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - prev_timestep: Optional[int]=None, - generator=None, - return_dict: bool=True, ) -> Union[UnCLIPSchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + prev_timestep: Optional[int] = None, + generator=None, + return_dict: bool = True, + ) -> Union[UnCLIPSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). @@ -236,12 +226,11 @@ def step( """ t = timestep - if (model_output.shape[1] == sample.shape[1] * 2 and - self.variance_type == "learned_range"): + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type == "learned_range": # must split like this, 3 -> split 2 -> [2, 1] model_output, predicted_variance = model_output.split( - [sample.shape[1], model_output.shape[1] - sample.shape[1]], - axis=1) + [sample.shape[1], model_output.shape[1] - sample.shape[1]], axis=1 + ) else: predicted_variance = None @@ -250,8 +239,7 @@ def step( prev_timestep = t - 1 alpha_prod_t = self.alphas_cumprod[t] - alpha_prod_t_prev = (self.alphas_cumprod[prev_timestep] - if prev_timestep >= 0 else self.one) + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one beta_prod_t = 1 - alpha_prod_t beta_prod_t_prev = 1 - alpha_prod_t_prev @@ -265,32 +253,31 @@ def step( # 2. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t** - (0.5) * model_output) / alpha_prod_t**(0.5) + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) elif self.config.prediction_type == "sample": pred_original_sample = model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `sample`" - " for the UnCLIPScheduler.") + " for the UnCLIPScheduler." + ) # 3. Clip "predicted x_0" if self.config.clip_sample: pred_original_sample = paddle.clip( pred_original_sample, -self.config.clip_sample_range, - self.config.clip_sample_range, ) + self.config.clip_sample_range, + ) # 4. 
Compute coefficients for pred_original_sample x_0 and current sample x_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_original_sample_coeff = (alpha_prod_t_prev - **(0.5) * beta) / beta_prod_t - current_sample_coeff = alpha**(0.5) * beta_prod_t_prev / beta_prod_t + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t + current_sample_coeff = alpha ** (0.5) * beta_prod_t_prev / beta_prod_t # 5. Compute predicted previous sample µ_t # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf - pred_prev_sample = (pred_original_sample_coeff * pred_original_sample + - current_sample_coeff * sample) + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample # 6. Add noise variance = 0 @@ -298,12 +285,14 @@ def step( variance_noise = randn_tensor( model_output.shape, dtype=model_output.dtype, - generator=generator, ) + generator=generator, + ) variance = self._get_variance( t, predicted_variance=predicted_variance, - prev_timestep=prev_timestep, ) + prev_timestep=prev_timestep, + ) if self.variance_type == "fixed_small_log": variance = variance @@ -312,15 +301,14 @@ def step( else: raise ValueError( f"variance_type given as {self.variance_type} must be one of `fixed_small_log` or `learned_range`" - " for the UnCLIPScheduler.") + " for the UnCLIPScheduler." + ) variance = variance * variance_noise pred_prev_sample = pred_prev_sample + variance if not return_dict: - return (pred_prev_sample, ) + return (pred_prev_sample,) - return UnCLIPSchedulerOutput( - prev_sample=pred_prev_sample, - pred_original_sample=pred_original_sample) + return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py index 4fb50fb0e19c2..fa85c31efc8c1 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_unipc_multistep.py @@ -23,8 +23,7 @@ import paddle from ..configuration_utils import ConfigMixin, register_to_config -from .scheduling_utils import (KarrasDiffusionSchedulers, SchedulerMixin, - SchedulerOutput) +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): @@ -46,7 +45,7 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2)**2 + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 betas = [] for i in range(num_diffusion_timesteps): @@ -126,40 +125,43 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_train_timesteps: int=1000, - beta_start: float=0.0001, - beta_end: float=0.02, - beta_schedule: str="linear", - trained_betas: Optional[Union[np.ndarray, List[float]]]=None, - solver_order: int=2, - prediction_type: str="epsilon", - thresholding: bool=False, - dynamic_thresholding_ratio: float=0.995, - sample_max_value: float=1.0, - predict_x0: bool=True, - solver_type: str="bh2", - lower_order_final: bool=True, - disable_corrector: List[int]=[], - solver_p: SchedulerMixin=None, ): + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + solver_order: int = 
2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + predict_x0: bool = True, + solver_type: str = "bh2", + lower_order_final: bool = True, + disable_corrector: List[int] = [], + solver_p: SchedulerMixin = None, + ): if trained_betas is not None: self.betas = paddle.to_tensor(trained_betas, dtype=paddle.float32) elif beta_schedule == "linear": - self.betas = paddle.linspace( - beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) + self.betas = paddle.linspace(beta_start, beta_end, num_train_timesteps, dtype=paddle.float32) elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. - self.betas = (paddle.linspace( - beta_start**0.5, - beta_end**0.5, - num_train_timesteps, - dtype=paddle.float32, )**2) + self.betas = ( + paddle.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=paddle.float32, + ) + ** 2 + ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule self.betas = betas_for_alpha_bar(num_train_timesteps) else: - raise NotImplementedError( - f"{beta_schedule} does is not implemented for {self.__class__}") + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") self.alphas = 1.0 - self.betas self.alphas_cumprod = paddle.cumprod(self.alphas, 0) @@ -175,16 +177,12 @@ def __init__( if solver_type in ["midpoint", "heun", "logrho"]: self.register_to_config(solver_type="bh1") else: - raise NotImplementedError( - f"{solver_type} does is not implemented for {self.__class__}" - ) + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") self.predict_x0 = predict_x0 # setable values self.num_inference_steps = None - timesteps = np.linspace( - 0, num_train_timesteps - 1, num_train_timesteps, - dtype=np.float32)[::-1].copy() + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() self.timesteps = paddle.to_tensor(timesteps) self.model_outputs = [None] * solver_order self.timestep_list = [None] * solver_order @@ -201,9 +199,12 @@ def set_timesteps(self, num_inference_steps: int): num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. """ - timesteps = (np.linspace(0, self.config.num_train_timesteps - 1, - num_inference_steps + 1).round()[::-1][:-1] - .copy().astype(np.int64)) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. 
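The comment above about duplicate timesteps can be reproduced directly: rounding the inference grid onto training steps collides once num_inference_steps approaches num_train_timesteps. A quick NumPy check (values are arbitrary):

```python
import numpy as np

num_train_timesteps, num_inference_steps = 1000, 1000
timesteps = (
    np.linspace(0, num_train_timesteps - 1, num_inference_steps + 1)
    .round()[::-1][:-1]
    .astype(np.int64)
)
# fewer unique values than entries means rounding produced collisions
print(len(timesteps), len(np.unique(timesteps)))
```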
@@ -214,7 +215,9 @@ def set_timesteps(self, num_inference_steps: int): self.num_inference_steps = len(timesteps) - self.model_outputs = [None, ] * self.config.solver_order + self.model_outputs = [ + None, + ] * self.config.solver_order self.lower_order_nums = 0 self.last_sample = None if self.solver_p: @@ -242,8 +245,7 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: abs_sample = sample.abs() # "a certain percentile absolute pixel value" - s = paddle.quantile( - abs_sample, self.config.dynamic_thresholding_ratio, axis=1) + s = paddle.quantile(abs_sample, self.config.dynamic_thresholding_ratio, axis=1) # paddle.clip donot support min > max if self.config.sample_max_value < 1: s = paddle.ones_like(s) * self.config.sample_max_value @@ -251,21 +253,15 @@ def _threshold_sample(self, sample: paddle.Tensor) -> paddle.Tensor: s = paddle.clip( s, min=1, max=self.config.sample_max_value ) # When clip to min=1, equivalent to standard clipping to [-1, 1] - s = s.unsqueeze( - 1) # (batch_size, 1) because clip will broadcast along axis=0 - sample = ( - paddle.clip(sample, -s, s) / - s) # "we threshold xt0 to the range [-s, s] and then divide by s" + s = s.unsqueeze(1) # (batch_size, 1) because clip will broadcast along axis=0 + sample = paddle.clip(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" sample = paddle.reshape(sample, [batch_size, channels, height, width]) sample = paddle.cast(sample, dtype) return sample - def convert_model_output(self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor) -> paddle.Tensor: + def convert_model_output(self, model_output: paddle.Tensor, timestep: int, sample: paddle.Tensor) -> paddle.Tensor: r""" Convert the model output to the corresponding type that the algorithm PC needs. @@ -280,19 +276,18 @@ def convert_model_output(self, """ if self.predict_x0: if self.config.prediction_type == "epsilon": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = (sample - sigma_t * model_output) / alpha_t elif self.config.prediction_type == "sample": x0_pred = model_output elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] x0_pred = alpha_t * sample - sigma_t * model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the UniPCMultistepScheduler.") + " `v_prediction` for the UniPCMultistepScheduler." 
+ ) if self.config.thresholding: x0_pred = self._threshold_sample(x0_pred) @@ -302,26 +297,26 @@ def convert_model_output(self, if self.config.prediction_type == "epsilon": return model_output elif self.config.prediction_type == "sample": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = (sample - alpha_t * model_output) / sigma_t return epsilon elif self.config.prediction_type == "v_prediction": - alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[ - timestep] + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] epsilon = alpha_t * model_output + sigma_t * sample return epsilon else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction` for the UniPCMultistepScheduler.") + " `v_prediction` for the UniPCMultistepScheduler." + ) def multistep_uni_p_bh_update( - self, - model_output: paddle.Tensor, - prev_timestep: int, - sample: paddle.Tensor, - order: int, ) -> paddle.Tensor: + self, + model_output: paddle.Tensor, + prev_timestep: int, + sample: paddle.Tensor, + order: int, + ) -> paddle.Tensor: """ One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. @@ -424,12 +419,13 @@ def multistep_uni_p_bh_update( return x_t def multistep_uni_c_bh_update( - self, - this_model_output: paddle.Tensor, - this_timestep: int, - last_sample: paddle.Tensor, - this_sample: paddle.Tensor, - order: int, ) -> paddle.Tensor: + self, + this_model_output: paddle.Tensor, + this_timestep: int, + last_sample: paddle.Tensor, + this_sample: paddle.Tensor, + order: int, + ) -> paddle.Tensor: """ One step for the UniC (B(h) version). @@ -512,8 +508,7 @@ def multistep_uni_c_bh_update( if self.predict_x0: x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 if D1s is not None: - corr_res = paddle.einsum("k,bkchw->bchw", - rhos_c[:-1].squeeze(1), D1s) + corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s) else: corr_res = 0 D1_t = model_t - m0 @@ -521,8 +516,7 @@ def multistep_uni_c_bh_update( else: x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 if D1s is not None: - corr_res = paddle.einsum("k,bkchw->bchw", - rhos_c[:-1].squeeze(1), D1s) + corr_res = paddle.einsum("k,bkchw->bchw", rhos_c[:-1].squeeze(1), D1s) else: corr_res = 0 D1_t = model_t - m0 @@ -531,11 +525,12 @@ def multistep_uni_c_bh_update( return x_t def step( - self, - model_output: paddle.Tensor, - timestep: int, - sample: paddle.Tensor, - return_dict: bool=True, ) -> Union[SchedulerOutput, Tuple]: + self, + model_output: paddle.Tensor, + timestep: int, + sample: paddle.Tensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: """ Step function propagating the sample with the multistep UniPC. 
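The `_threshold_sample` hunk above applies Imagen-style dynamic thresholding to the predicted x0. A NumPy sketch of the same idea, with shapes and names assumed for illustration:

```python
import numpy as np

def threshold_sample(x0, ratio=0.995, sample_max_value=1.0):
    b = x0.shape[0]
    flat = np.abs(x0.reshape(b, -1))
    s = np.quantile(flat, ratio, axis=1)       # per-sample dynamic threshold
    if sample_max_value < 1:
        s = np.full_like(s, sample_max_value)
    else:
        s = np.clip(s, 1.0, sample_max_value)  # clamp to at least 1
    s = s.reshape(b, *([1] * (x0.ndim - 1)))   # broadcast over remaining dims
    return np.clip(x0, -s, s) / s              # threshold to [-s, s], then rescale

x0 = 3.0 * np.random.randn(2, 3, 8, 8)
print(np.abs(threshold_sample(x0)).max())  # bounded by 1.0 after rescaling
```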
@@ -563,23 +558,22 @@ def step( else: step_index = step_index.item() - use_corrector = (step_index > 0 and - step_index - 1 not in self.disable_corrector and - self.last_sample is not None) + use_corrector = ( + step_index > 0 and step_index - 1 not in self.disable_corrector and self.last_sample is not None + ) - model_output_convert = self.convert_model_output(model_output, timestep, - sample) + model_output_convert = self.convert_model_output(model_output, timestep, sample) if use_corrector: sample = self.multistep_uni_c_bh_update( this_model_output=model_output_convert, this_timestep=timestep, last_sample=self.last_sample, this_sample=sample, - order=self.this_order, ) + order=self.this_order, + ) # now prepare to run the predictor - prev_timestep = (0 if step_index == len(self.timesteps) - 1 else - self.timesteps[step_index + 1]) + prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1] for i in range(self.config.solver_order - 1): self.model_outputs[i] = self.model_outputs[i + 1] @@ -589,13 +583,11 @@ def step( self.timestep_list[-1] = timestep if self.config.lower_order_final: - this_order = min(self.config.solver_order, - len(self.timesteps) - step_index) + this_order = min(self.config.solver_order, len(self.timesteps) - step_index) else: this_order = self.config.solver_order - self.this_order = min(this_order, - self.lower_order_nums + 1) # warmup for multistep + self.this_order = min(this_order, self.lower_order_nums + 1) # warmup for multistep assert self.this_order > 0 self.last_sample = sample @@ -603,18 +595,18 @@ def step( model_output=model_output, # pass the original non-converted model output, in case solver-p is used prev_timestep=prev_timestep, sample=sample, - order=self.this_order, ) + order=self.this_order, + ) if self.lower_order_nums < self.config.solver_order: self.lower_order_nums += 1 if not return_dict: - return (prev_sample, ) + return (prev_sample,) return SchedulerOutput(prev_sample=prev_sample) - def scale_model_input(self, sample: paddle.Tensor, *args, - **kwargs) -> paddle.Tensor: + def scale_model_input(self, sample: paddle.Tensor, *args, **kwargs) -> paddle.Tensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. 
@@ -629,26 +621,25 @@ def scale_model_input(self, sample: paddle.Tensor, *args, # Copied from ppdiffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( - self, - original_samples: paddle.Tensor, - noise: paddle.Tensor, - timesteps: paddle.Tensor, ) -> paddle.Tensor: + self, + original_samples: paddle.Tensor, + noise: paddle.Tensor, + timesteps: paddle.Tensor, + ) -> paddle.Tensor: # Make sure alphas_cumprod and timestep have same dtype as original_samples alphas_cumprod = self.alphas_cumprod.cast(original_samples.dtype) - sqrt_alpha_prod = alphas_cumprod[timesteps]**0.5 + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 sqrt_alpha_prod = sqrt_alpha_prod.flatten() while len(sqrt_alpha_prod.shape) < len(original_samples.shape): sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps])**0.5 + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len( - original_samples.shape): + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - noisy_samples = (sqrt_alpha_prod * original_samples + - sqrt_one_minus_alpha_prod * noise) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples def __len__(self): diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py index 96707f403e49a..d5bcdca0d1a9f 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_utils.py @@ -76,11 +76,12 @@ class SchedulerMixin: @classmethod def from_pretrained( - cls, - pretrained_model_name_or_path: Dict[str, Any]=None, - subfolder: Optional[str]=None, - return_unused_kwargs: bool=False, - **kwargs, ): + cls, + pretrained_model_name_or_path: Dict[str, Any] = None, + subfolder: Optional[str] = None, + return_unused_kwargs: bool = False, + **kwargs, + ): r""" Instantiate a Scheduler class from a pre-defined JSON configuration file inside a directory or Hub repo. @@ -142,15 +143,16 @@ def from_pretrained( subfolder=subfolder, return_unused_kwargs=True, return_commit_hash=True, - **kwargs, ) - return cls.from_config( - config, return_unused_kwargs=return_unused_kwargs, **kwargs) + **kwargs, + ) + return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - push_to_hub: bool=False, - **kwargs, ): + self, + save_directory: Union[str, os.PathLike], + push_to_hub: bool = False, + **kwargs, + ): """ Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the [`~SchedulerMixin.from_pretrained`] class method. @@ -159,8 +161,7 @@ def save_pretrained( save_directory (`str` or `os.PathLike`): Directory where the configuration JSON file will be saved (will be created if it does not exist). 
""" - self.save_config( - save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) + self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) @property def compatibles(self): @@ -177,7 +178,6 @@ def _get_compatibles(cls): compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) diffusers_library = importlib.import_module(__name__.split(".")[0]) compatible_classes = [ - getattr(diffusers_library, c) for c in compatible_classes_str - if hasattr(diffusers_library, c) + getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) ] return compatible_classes diff --git a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py b/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py index 71ee1bc4ad4e8..f9f3c34bba785 100644 --- a/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py +++ b/ppdiffusers/ppdiffusers/schedulers/scheduling_vq_diffusion.py @@ -69,8 +69,7 @@ def index_to_log_onehot(x: paddle.Tensor, num_classes: int) -> paddle.Tensor: return log_x -def gumbel_noised(logits: paddle.Tensor, - generator: Optional[paddle.Generator]) -> paddle.Tensor: +def gumbel_noised(logits: paddle.Tensor, generator: Optional[paddle.Generator]) -> paddle.Tensor: """ Apply gumbel noise to `logits` """ @@ -80,34 +79,32 @@ def gumbel_noised(logits: paddle.Tensor, return noised -def alpha_schedules(num_diffusion_timesteps: int, - alpha_cum_start=0.99999, - alpha_cum_end=0.000009): +def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009): """ Cumulative and non-cumulative alpha schedules. See section 4.1. """ - att = (np.arange(0, num_diffusion_timesteps) / - (num_diffusion_timesteps - 1) * - (alpha_cum_end - alpha_cum_start) + alpha_cum_start) + att = ( + np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start) + + alpha_cum_start + ) att = np.concatenate(([1], att)) at = att[1:] / att[:-1] att = np.concatenate((att[1:], [1])) return at, att -def gamma_schedules(num_diffusion_timesteps: int, - gamma_cum_start=0.000009, - gamma_cum_end=0.99999): +def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999): """ Cumulative and non-cumulative gamma schedules. See section 4.1. 
""" - ctt = (np.arange(0, num_diffusion_timesteps) / - (num_diffusion_timesteps - 1) * - (gamma_cum_end - gamma_cum_start) + gamma_cum_start) + ctt = ( + np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start) + + gamma_cum_start + ) ctt = np.concatenate(([0], ctt)) one_minus_ctt = 1 - ctt one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1] @@ -155,13 +152,14 @@ class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - num_vec_classes: int, - num_train_timesteps: int=100, - alpha_cum_start: float=0.99999, - alpha_cum_end: float=0.000009, - gamma_cum_start: float=0.000009, - gamma_cum_end: float=0.99999, ): + self, + num_vec_classes: int, + num_train_timesteps: int = 100, + alpha_cum_start: float = 0.99999, + alpha_cum_end: float = 0.000009, + gamma_cum_start: float = 0.000009, + gamma_cum_end: float = 0.99999, + ): self.num_embed = num_vec_classes # By convention, the index for the mask class is the last class index @@ -170,11 +168,13 @@ def __init__( at, att = alpha_schedules( num_train_timesteps, alpha_cum_start=alpha_cum_start, - alpha_cum_end=alpha_cum_end, ) + alpha_cum_end=alpha_cum_end, + ) ct, ctt = gamma_schedules( num_train_timesteps, gamma_cum_start=gamma_cum_start, - gamma_cum_end=gamma_cum_end, ) + gamma_cum_end=gamma_cum_end, + ) num_non_mask_classes = self.num_embed - 1 bt = (1 - at - ct) / num_non_mask_classes @@ -203,8 +203,7 @@ def __init__( # setable values self.num_inference_steps = None - self.timesteps = paddle.to_tensor( - np.arange(0, num_train_timesteps)[::-1].copy()) + self.timesteps = paddle.to_tensor(np.arange(0, num_train_timesteps)[::-1].copy()) def set_timesteps(self, num_inference_steps: int): """ @@ -219,14 +218,13 @@ def set_timesteps(self, num_inference_steps: int): self.timesteps = paddle.to_tensor(timesteps) def step( - self, - model_output: paddle.Tensor, - timestep: paddle.Tensor, - sample: paddle.Tensor, - generator: Optional[Union[paddle.Generator, List[ - paddle.Generator]]]=None, - return_dict: bool=True, ) -> Union[VQDiffusionSchedulerOutput, - Tuple]: + self, + model_output: paddle.Tensor, + timestep: paddle.Tensor, + sample: paddle.Tensor, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + return_dict: bool = True, + ) -> Union[VQDiffusionSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep via the reverse transition distribution i.e. Equation (11). See the docstring for `self.q_posterior` for more in depth docs on how Equation (11) is computed. @@ -263,7 +261,7 @@ def step( x_t_min_1 = log_p_x_t_min_1.argmax(axis=1) if not return_dict: - return (x_t_min_1, ) + return (x_t_min_1,) return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1) @@ -299,10 +297,12 @@ def q_posterior(self, log_p_x_0, x_t, t): log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed) log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class( - t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True) + t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True + ) log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class( - t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False) + t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False + ) # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) # . . . @@ -384,12 +384,9 @@ def q_posterior(self, log_p_x_0, x_t, t): # The last row is trivially verified. 
The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities. return log_p_x_t_min_1 - def log_Q_t_transitioning_to_known_class(self, - *, - t: paddle.Tensor, - x_t: paddle.Tensor, - log_onehot_x_t: paddle.Tensor, - cumulative: bool): + def log_Q_t_transitioning_to_known_class( + self, *, t: paddle.Tensor, x_t: paddle.Tensor, log_onehot_x_t: paddle.Tensor, cumulative: bool + ): """ Returns the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each latent pixel in `x_t`. @@ -462,9 +459,7 @@ def log_Q_t_transitioning_to_known_class(self, # # `P(x_t=mask|x_{t-1=mask}) = 1` and 1 will be the value of the last row of the onehot vector # if x_t is masked - log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, - -1, :].unsqueeze( - 1) + log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1) # `index_to_log_onehot` will add onehot vectors for masked pixels, # so the default one hot matrix has one too many rows. See the doc string @@ -486,14 +481,12 @@ def log_Q_t_transitioning_to_known_class(self, # The whole column of each masked pixel is `c` mask_class_mask = x_t == self.mask_class - mask_class_mask = mask_class_mask.unsqueeze(1).expand( - [-1, self.num_embed - 1, -1]) + mask_class_mask = mask_class_mask.unsqueeze(1).expand([-1, self.num_embed - 1, -1]) # log_Q_t[mask_class_mask] = c log_Q_t = paddle.where(mask_class_mask, c, log_Q_t) if not cumulative: - log_Q_t = paddle.concat( - (log_Q_t, log_onehot_x_t_transitioning_from_masked), axis=1) + log_Q_t = paddle.concat((log_Q_t, log_onehot_x_t_transitioning_from_masked), axis=1) return log_Q_t diff --git a/ppdiffusers/ppdiffusers/training_utils.py b/ppdiffusers/ppdiffusers/training_utils.py index dba0703882e22..32a8251578ea7 100644 --- a/ppdiffusers/ppdiffusers/training_utils.py +++ b/ppdiffusers/ppdiffusers/training_utils.py @@ -67,17 +67,18 @@ class EMAModel: """ def __init__( - self, - parameters, - decay: float=0.9999, - min_decay: float=0.0, - update_after_step: int=0, - use_ema_warmup: bool=False, - inv_gamma: Union[float, int]=1.0, - power: Union[float, int]=2 / 3, - model_cls: Optional[Any]=None, - model_config: Dict[str, Any]=None, - **kwargs, ): + self, + parameters, + decay: float = 0.9999, + min_decay: float = 0.0, + update_after_step: int = 0, + use_ema_warmup: bool = False, + inv_gamma: Union[float, int] = 1.0, + power: Union[float, int] = 2 / 3, + model_cls: Optional[Any] = None, + model_config: Dict[str, Any] = None, + **kwargs, + ): """ Args: parameters (Iterable[nn.Parameter]): The parameters to track. @@ -99,39 +100,35 @@ def __init__( if isinstance(parameters, nn.Layer): deprecation_message = ( "Passing a `nn.Layer` to `ExponentialMovingAverage` is deprecated. " - "Please pass the parameters of the module instead.") + "Please pass the parameters of the module instead." + ) deprecate( "passing a `nn.Layer` to `ExponentialMovingAverage`", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) parameters = parameters.parameters() # set use_ema_warmup to True if a nn.Layer is passed for backwards compatibility use_ema_warmup = True if kwargs.get("max_value", None) is not None: - deprecation_message = ( - "The `max_value` argument is deprecated. Please use `decay` instead." - ) - deprecate( - "max_value", "1.0.0", deprecation_message, standard_warn=False) + deprecation_message = "The `max_value` argument is deprecated. Please use `decay` instead." 
+ deprecate("max_value", "1.0.0", deprecation_message, standard_warn=False) decay = kwargs["max_value"] if kwargs.get("min_value", None) is not None: deprecation_message = "The `min_value` argument is deprecated. Please use `min_decay` instead." - deprecate( - "min_value", "1.0.0", deprecation_message, standard_warn=False) + deprecate("min_value", "1.0.0", deprecation_message, standard_warn=False) min_decay = kwargs["min_value"] parameters = list(parameters) self.shadow_params = [p.clone().detach() for p in parameters] if kwargs.get("device", None) is not None: - deprecation_message = ( - "The `device` argument is deprecated. Please use `to` instead.") - deprecate( - "device", "1.0.0", deprecation_message, standard_warn=False) + deprecation_message = "The `device` argument is deprecated. Please use `to` instead." + deprecate("device", "1.0.0", deprecation_message, standard_warn=False) self.to(device=kwargs["device"]) self.temp_stored_params = None @@ -153,23 +150,17 @@ def from_pretrained(cls, path, model_cls) -> "EMAModel": _, ema_kwargs = model_cls.load_config(path, return_unused_kwargs=True) model = model_cls.from_pretrained(path) - ema_model = cls(model.parameters(), - model_cls=model_cls, - model_config=model.config) + ema_model = cls(model.parameters(), model_cls=model_cls, model_config=model.config) ema_model.load_state_dict(ema_kwargs) return ema_model def save_pretrained(self, path): if self.model_cls is None: - raise ValueError( - "`save_pretrained` can only be used if `model_cls` was defined at __init__." - ) + raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.") if self.model_config is None: - raise ValueError( - "`save_pretrained` can only be used if `model_config` was defined at __init__." - ) + raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.") model = self.model_cls.from_config(self.model_config) state_dict = self.state_dict() @@ -190,7 +181,7 @@ def get_decay(self, optimization_step: int) -> float: return 0.0 if self.use_ema_warmup: - cur_decay_value = 1 - (1 + step / self.inv_gamma)**-self.power + cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power else: cur_decay_value = (1 + step) / (10 + step) @@ -204,12 +195,14 @@ def step(self, parameters): if isinstance(parameters, nn.Layer): deprecation_message = ( "Passing a `nn.Layer` to `ExponentialMovingAverage.step` is deprecated. " - "Please pass the parameters of the module instead.") + "Please pass the parameters of the module instead." + ) deprecate( "passing a `nn.Layer` to `ExponentialMovingAverage.step`", "1.0.0", deprecation_message, - standard_warn=False, ) + standard_warn=False, + ) parameters = parameters.parameters() parameters = list(parameters) @@ -223,8 +216,7 @@ def step(self, parameters): for s_param, param in zip(self.shadow_params, parameters): if not param.stop_gradient: - s_param.copy_(s_param - one_minus_decay * (s_param - param), - True) + s_param.copy_(s_param - one_minus_decay * (s_param - param), True) else: s_param.copy_(param, True) @@ -267,9 +259,7 @@ def store(self, parameters) -> None: parameters: Iterable of `nn.Parameter`; the parameters to be temporarily stored. """ - self.temp_stored_params = [ - param.detach().cpu().clone() for param in parameters - ] + self.temp_stored_params = [param.detach().cpu().clone() for param in parameters] def restore(self, parameters) -> None: r""" @@ -282,9 +272,7 @@ def restore(self, parameters) -> None: `ExponentialMovingAverage` was initialized will be used. 
""" if self.temp_stored_params is None: - raise RuntimeError( - "This ExponentialMovingAverage has no `store()`ed weights " - "to `restore()`") + raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights " "to `restore()`") for c_param, param in zip(self.temp_stored_params, parameters): param.copy_(c_param, True) @@ -310,18 +298,15 @@ def load_state_dict(self, state_dict: dict) -> None: if not isinstance(self.min_decay, float): raise ValueError("Invalid min_decay") - self.optimization_step = state_dict.get("optimization_step", - self.optimization_step) + self.optimization_step = state_dict.get("optimization_step", self.optimization_step) if not isinstance(self.optimization_step, int): raise ValueError("Invalid optimization_step") - self.update_after_step = state_dict.get("update_after_step", - self.update_after_step) + self.update_after_step = state_dict.get("update_after_step", self.update_after_step) if not isinstance(self.update_after_step, int): raise ValueError("Invalid update_after_step") - self.use_ema_warmup = state_dict.get("use_ema_warmup", - self.use_ema_warmup) + self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup) if not isinstance(self.use_ema_warmup, bool): raise ValueError("Invalid use_ema_warmup") @@ -338,8 +323,7 @@ def load_state_dict(self, state_dict: dict) -> None: self.shadow_params = shadow_params if not isinstance(self.shadow_params, list): raise ValueError("shadow_params must be a list") - if not all( - isinstance(p, paddle.Tensor) for p in self.shadow_params): + if not all(isinstance(p, paddle.Tensor) for p in self.shadow_params): raise ValueError("shadow_params must all be Tensors") @@ -353,17 +337,13 @@ def main_process_first(desc="work"): try: if not is_main_process: # tell all replicas to wait - logger.debug( - f"{rank}: waiting for the {main_process_desc} to perform {desc}" - ) + logger.debug(f"{rank}: waiting for the {main_process_desc} to perform {desc}") paddle.distributed.barrier() yield finally: if is_main_process: # the wait is over - logger.debug( - f"{rank}: {main_process_desc} completed {desc}, releasing all replicas" - ) + logger.debug(f"{rank}: {main_process_desc} completed {desc}, releasing all replicas") paddle.distributed.barrier() else: yield diff --git a/ppdiffusers/ppdiffusers/utils/__init__.py b/ppdiffusers/ppdiffusers/utils/__init__.py index 93a62dd290d7b..4b5b8ba7e4234 100644 --- a/ppdiffusers/ppdiffusers/utils/__init__.py +++ b/ppdiffusers/ppdiffusers/utils/__init__.py @@ -20,33 +20,78 @@ from ..version import VERSION as __version__ from . 
import initializer_utils from .constants import ( - CONFIG_NAME, DEPRECATED_REVISION_ARGS, DIFFUSERS_CACHE, DOWNLOAD_SERVER, - FASTDEPLOY_MODEL_NAME, FASTDEPLOY_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, - FROM_DIFFUSERS, FROM_HF_HUB, HF_MODULES_CACHE, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, LOW_CPU_MEM_USAGE_DEFAULT, NEG_INF, - ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, PADDLE_WEIGHTS_NAME, - PPDIFFUSERS_CACHE, PPDIFFUSERS_DYNAMIC_MODULE_NAME, - PPDIFFUSERS_MODULES_CACHE, PPNLP_BOS_RESOLVE_ENDPOINT, TEST_DOWNLOAD_SERVER, - TEXT_ENCODER_ATTN_MODULE, TO_DIFFUSERS, TORCH_SAFETENSORS_WEIGHTS_NAME, - TORCH_WEIGHTS_NAME, WEIGHTS_NAME, get_map_location_default, str2bool) + CONFIG_NAME, + DEPRECATED_REVISION_ARGS, + DIFFUSERS_CACHE, + DOWNLOAD_SERVER, + FASTDEPLOY_MODEL_NAME, + FASTDEPLOY_WEIGHTS_NAME, + FLAX_WEIGHTS_NAME, + FROM_DIFFUSERS, + FROM_HF_HUB, + HF_MODULES_CACHE, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + LOW_CPU_MEM_USAGE_DEFAULT, + NEG_INF, + ONNX_EXTERNAL_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + PADDLE_WEIGHTS_NAME, + PPDIFFUSERS_CACHE, + PPDIFFUSERS_DYNAMIC_MODULE_NAME, + PPDIFFUSERS_MODULES_CACHE, + PPNLP_BOS_RESOLVE_ENDPOINT, + TEST_DOWNLOAD_SERVER, + TEXT_ENCODER_ATTN_MODULE, + TO_DIFFUSERS, + TORCH_SAFETENSORS_WEIGHTS_NAME, + TORCH_WEIGHTS_NAME, + WEIGHTS_NAME, + get_map_location_default, + str2bool, +) from .deprecation_utils import deprecate from .doc_utils import replace_example_docstring -from .download_utils import (_add_variant, _get_model_file, bos_hf_download, - ppdiffusers_bos_dir_download, - ppdiffusers_url_download) +from .download_utils import ( + _add_variant, + _get_model_file, + bos_hf_download, + ppdiffusers_bos_dir_download, + ppdiffusers_url_download, +) from .dynamic_modules_utils import get_class_from_dynamic_module from .hub_utils import HF_HUB_OFFLINE, extract_commit_hash, http_user_agent from .import_utils import ( - BACKENDS_MAPPING, ENV_VARS_TRUE_AND_AUTO_VALUES, ENV_VARS_TRUE_VALUES, - DummyObject, OptionalDependencyNotAvailable, is_bs4_available, - is_einops_available, is_fastdeploy_available, is_ftfy_available, - is_inflect_available, is_k_diffusion_available, is_k_diffusion_version, - is_librosa_available, is_note_seq_available, is_omegaconf_available, - is_paddle_available, is_paddle_version, is_paddlenlp_available, - is_paddlenlp_version, is_ppxformers_available, is_safetensors_available, - is_scipy_available, is_tensorboard_available, is_torch_available, - is_torch_version, is_unidecode_available, is_visualdl_available, - is_wandb_available, requires_backends) + BACKENDS_MAPPING, + ENV_VARS_TRUE_AND_AUTO_VALUES, + ENV_VARS_TRUE_VALUES, + DummyObject, + OptionalDependencyNotAvailable, + is_bs4_available, + is_einops_available, + is_fastdeploy_available, + is_ftfy_available, + is_inflect_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_librosa_available, + is_note_seq_available, + is_omegaconf_available, + is_paddle_available, + is_paddle_version, + is_paddlenlp_available, + is_paddlenlp_version, + is_ppxformers_available, + is_safetensors_available, + is_scipy_available, + is_tensorboard_available, + is_torch_available, + is_torch_version, + is_unidecode_available, + is_visualdl_available, + is_wandb_available, + requires_backends, +) + # custom load_utils from .load_utils import is_torch_file, safetensors_load, smart_load, torch_load from .logging import get_logger @@ -56,9 +101,21 @@ if is_paddle_available(): from .testing_utils import ( - floats_tensor, image_grid, load_hf_numpy, load_image, load_numpy, - load_pd, load_ppnlp_numpy, nightly, 
paddle_all_close, paddle_device, - parse_flag_from_env, print_tensor_test, require_paddle_gpu, slow) + floats_tensor, + image_grid, + load_hf_numpy, + load_image, + load_numpy, + load_pd, + load_ppnlp_numpy, + nightly, + paddle_all_close, + paddle_device, + parse_flag_from_env, + print_tensor_test, + require_paddle_gpu, + slow, + ) if is_torch_available(): from .testing_utils import require_torch diff --git a/ppdiffusers/ppdiffusers/utils/constants.py b/ppdiffusers/ppdiffusers/utils/constants.py index 2a112f725dc0c..2e51e9e559395 100644 --- a/ppdiffusers/ppdiffusers/utils/constants.py +++ b/ppdiffusers/ppdiffusers/utils/constants.py @@ -31,9 +31,8 @@ def str2bool(v): ppnlp_cache_home = os.path.expanduser( - os.getenv("PPNLP_HOME", - os.path.join( - os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp"))) + os.getenv("PPNLP_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp")) +) ppdiffusers_default_cache_path = os.path.join(ppnlp_cache_home, "ppdiffusers") # diffusers_default_cache_path = os.path.join(HUGGINGFACE_HUB_CACHE, "diffusers") @@ -51,25 +50,20 @@ def str2bool(v): DIFFUSERS_CACHE = diffusers_default_cache_path DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" PPDIFFUSERS_DYNAMIC_MODULE_NAME = "ppdiffusers_modules" -HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", - os.path.join(hf_cache_home, "modules")) -PPDIFFUSERS_MODULES_CACHE = os.getenv("PPDIFFUSERS_MODULES_CACHE", - os.path.join(ppnlp_cache_home, "modules")) +HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")) +PPDIFFUSERS_MODULES_CACHE = os.getenv("PPDIFFUSERS_MODULES_CACHE", os.path.join(ppnlp_cache_home, "modules")) PADDLE_WEIGHTS_NAME = "model_state.pdparams" FASTDEPLOY_WEIGHTS_NAME = "inference.pdiparams" FASTDEPLOY_MODEL_NAME = "inference.pdmodel" WEIGHTS_NAME = PADDLE_WEIGHTS_NAME -TEST_DOWNLOAD_SERVER = ( - "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests") +TEST_DOWNLOAD_SERVER = "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests" DOWNLOAD_SERVER = "https://bj.bcebos.com/paddlenlp/models/community" -PPNLP_BOS_RESOLVE_ENDPOINT = os.getenv("PPNLP_ENDPOINT", - "https://bj.bcebos.com/paddlenlp") +PPNLP_BOS_RESOLVE_ENDPOINT = os.getenv("PPNLP_ENDPOINT", "https://bj.bcebos.com/paddlenlp") DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"] TEXT_ENCODER_ATTN_MODULE = ".self_attn" -LOW_CPU_MEM_USAGE_DEFAULT = str2bool( - os.getenv("LOW_CPU_MEM_USAGE_DEFAULT", False)) +LOW_CPU_MEM_USAGE_DEFAULT = str2bool(os.getenv("LOW_CPU_MEM_USAGE_DEFAULT", False)) NEG_INF = -1e4 @@ -87,5 +81,4 @@ def str2bool(v): def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): print(x.tolist()) print(y.tolist()) - return raw_all_close( - x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name) + return raw_all_close(x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name) diff --git a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py b/ppdiffusers/ppdiffusers/utils/deprecation_utils.py index 8207e2c77d07f..010f89e11386e 100644 --- a/ppdiffusers/ppdiffusers/utils/deprecation_utils.py +++ b/ppdiffusers/ppdiffusers/utils/deprecation_utils.py @@ -21,39 +21,38 @@ def deprecate( - *args, - take_from: Optional[Union[Dict, Any]]=None, - standard_warn=True, - stacklevel=2, ): + *args, + take_from: Optional[Union[Dict, Any]] = None, + standard_warn=True, + stacklevel=2, +): from ..version import VERSION as __version__ deprecated_kwargs = take_from values = () if not isinstance(args[0], tuple): - args = (args, ) + args = (args,) for attribute, 
version_name, message in args: - if version.parse(version.parse(__version__) - .base_version) >= version.parse(version_name): + if version.parse(version.parse(__version__).base_version) >= version.parse(version_name): raise ValueError( f"The deprecation tuple {(attribute, version_name, message)} should be removed since ppdiffusers'" - f" version {__version__} is >= {version_name}") + f" version {__version__} is >= {version_name}" + ) warning = None - if isinstance(deprecated_kwargs, - dict) and attribute in deprecated_kwargs: - values += (deprecated_kwargs.pop(attribute), ) + if isinstance(deprecated_kwargs, dict) and attribute in deprecated_kwargs: + values += (deprecated_kwargs.pop(attribute),) warning = f"The `{attribute}` argument is deprecated and will be removed in version {version_name}." elif hasattr(deprecated_kwargs, attribute): - values += (getattr(deprecated_kwargs, attribute), ) + values += (getattr(deprecated_kwargs, attribute),) warning = f"The `{attribute}` attribute is deprecated and will be removed in version {version_name}." elif deprecated_kwargs is None: warning = f"`{attribute}` is deprecated and will be removed in version {version_name}." if warning is not None: warning = warning + " " if standard_warn else "" - warnings.warn( - warning + message, FutureWarning, stacklevel=stacklevel) + warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel) if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0: call_frame = inspect.getouterframes(inspect.currentframe())[1] @@ -61,9 +60,7 @@ def deprecate( line_number = call_frame.lineno function = call_frame.function key, value = next(iter(deprecated_kwargs.items())) - raise TypeError( - f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`" - ) + raise TypeError(f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`") if len(values) == 0: return diff --git a/ppdiffusers/ppdiffusers/utils/doc_utils.py b/ppdiffusers/ppdiffusers/utils/doc_utils.py index c8b3fe1ab24bc..01188c98e9152 100644 --- a/ppdiffusers/ppdiffusers/utils/doc_utils.py +++ b/ppdiffusers/ppdiffusers/utils/doc_utils.py @@ -23,8 +23,7 @@ def docstring_decorator(fn): func_doc = fn.__doc__ lines = func_doc.split("\n") i = 0 - while i < len(lines) and re.search(r"^\s*Examples?:\s*$", - lines[i]) is None: + while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None: i += 1 if i < len(lines): lines[i] = example_docstring @@ -32,7 +31,8 @@ def docstring_decorator(fn): else: raise ValueError( f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, " - f"current docstring is:\n{func_doc}") + f"current docstring is:\n{func_doc}" + ) fn.__doc__ = func_doc return fn diff --git a/ppdiffusers/ppdiffusers/utils/download_utils.py b/ppdiffusers/ppdiffusers/utils/download_utils.py index a65ba335b0282..2ef31e8ba396b 100644 --- a/ppdiffusers/ppdiffusers/utils/download_utils.py +++ b/ppdiffusers/ppdiffusers/utils/download_utils.py @@ -28,8 +28,11 @@ from filelock import FileLock from huggingface_hub import hf_hub_download from huggingface_hub.file_download import _chmod_and_replace, http_get -from huggingface_hub.utils import (EntryNotFoundError, RepositoryNotFoundError, - RevisionNotFoundError) +from huggingface_hub.utils import ( + EntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, +) from huggingface_hub.utils import tqdm as hf_tqdm from packaging import version from requests import HTTPError @@ -37,14 +40,18 @@ from 
tqdm.contrib.concurrent import thread_map from ..version import VERSION as __version__ -from .constants import (DEPRECATED_REVISION_ARGS, - HUGGINGFACE_CO_RESOLVE_ENDPOINT, PPDIFFUSERS_CACHE, - PPNLP_BOS_RESOLVE_ENDPOINT, - TORCH_SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME) +from .constants import ( + DEPRECATED_REVISION_ARGS, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + PPDIFFUSERS_CACHE, + PPNLP_BOS_RESOLVE_ENDPOINT, + TORCH_SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, +) from .logging import get_logger -def _add_variant(weights_name: str, variant: Optional[str]=None) -> str: +def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: if variant is not None: splits = weights_name.split(".") splits = splits[:-1] + [variant] + splits[-1:] @@ -55,36 +62,34 @@ def _add_variant(weights_name: str, variant: Optional[str]=None) -> str: # https://github.com/huggingface/diffusers/blob/da2ce1a6b92f48cabe9e9d3944c4ee8b007b2871/src/diffusers/utils/hub_utils.py#L246 def _get_model_file( - pretrained_model_name_or_path, - *, - weights_name, - subfolder, - cache_dir, - force_download=False, - revision=None, - proxies=None, - resume_download=False, - local_files_only=None, - use_auth_token=None, - user_agent=None, - commit_hash=None, - file_lock_timeout=-1, - from_hf_hub=False, ): + pretrained_model_name_or_path, + *, + weights_name, + subfolder, + cache_dir, + force_download=False, + revision=None, + proxies=None, + resume_download=False, + local_files_only=None, + use_auth_token=None, + user_agent=None, + commit_hash=None, + file_lock_timeout=-1, + from_hf_hub=False, +): pretrained_model_name_or_path = str(pretrained_model_name_or_path) if os.path.isfile(pretrained_model_name_or_path): return pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile( - os.path.join(pretrained_model_name_or_path, weights_name)): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)): # Load from a PyTorch checkpoint - model_file = os.path.join(pretrained_model_name_or_path, - weights_name) + model_file = os.path.join(pretrained_model_name_or_path, weights_name) return model_file elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, - weights_name)): - model_file = os.path.join(pretrained_model_name_or_path, subfolder, - weights_name) + os.path.join(pretrained_model_name_or_path, subfolder, weights_name) + ): + model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name) return model_file else: raise EnvironmentError( @@ -105,19 +110,20 @@ def _get_model_file( use_auth_token=use_auth_token, user_agent=user_agent, file_lock_timeout=file_lock_timeout, - commit_hash=commit_hash, ) + commit_hash=commit_hash, + ) REPO_TYPES = ["model"] DEFAULT_REVISION = "main" # REPO_ID_SEPARATOR = "--" REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") -PPDIFFUSERS_BOS_URL_TEMPLATE = ( - PPNLP_BOS_RESOLVE_ENDPOINT + - "/{repo_type}/community/{repo_id}/{revision}/{filename}") +PPDIFFUSERS_BOS_URL_TEMPLATE = PPNLP_BOS_RESOLVE_ENDPOINT + "/{repo_type}/community/{repo_id}/{revision}/{filename}" ALLOW_PATTERNS_MAPPING = { - "scheduler": ["scheduler_config.json", ], + "scheduler": [ + "scheduler_config.json", + ], "text_encoder": [ "model_state.pdparams", "config.json", @@ -190,12 +196,13 @@ def _get_model_file( def ppdiffusers_bos_url( - repo_id: str, - filename: str, - *, - subfolder: Optional[str]=None, - repo_type: Optional[str]=None, - revision: Optional[str]=None, ) -> str: + repo_id: str, + filename: str, 
+ *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, +) -> str: if subfolder == "": subfolder = None if subfolder is not None: @@ -212,9 +219,9 @@ def ppdiffusers_bos_url( return PPDIFFUSERS_BOS_URL_TEMPLATE.format( repo_type=repo_type, repo_id=repo_id, - revision=quote( - revision, safe=""), - filename=quote(filename), ).replace(f"/{DEFAULT_REVISION}/", "/") + revision=quote(revision, safe=""), + filename=quote(filename), + ).replace(f"/{DEFAULT_REVISION}/", "/") def repo_folder_name(*, repo_id: str, repo_type: str) -> str: @@ -229,16 +236,17 @@ def repo_folder_name(*, repo_id: str, repo_type: str) -> str: def ppdiffusers_bos_download( - repo_id: str, - filename: str, - *, - subfolder: Optional[str]=None, - repo_type: Optional[str]=None, - revision: Optional[str]=None, - cache_dir: Union[str, Path, None]=None, - force_download: bool=False, - resume_download: bool=False, - file_lock_timeout: int=-1, ): + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + force_download: bool = False, + resume_download: bool = False, + file_lock_timeout: int = -1, +): if cache_dir is None: cache_dir = PPDIFFUSERS_CACHE if revision is None: @@ -256,12 +264,8 @@ def ppdiffusers_bos_download( repo_type = REPO_TYPES[0] if repo_type not in REPO_TYPES: - raise ValueError( - f"Invalid repo type: {repo_type}. Accepted repo types are:" - f" {str(REPO_TYPES)}") - storage_folder = os.path.join( - cache_dir, repo_folder_name( - repo_id=repo_id, repo_type=repo_type)) + raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are:" f" {str(REPO_TYPES)}") + storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) os.makedirs(storage_folder, exist_ok=True) # cross platform transcription of filename, to be used as a local file path. @@ -275,8 +279,7 @@ def ppdiffusers_bos_download( if os.path.exists(pointer_path) and not force_download: return pointer_path - url_to_download = ppdiffusers_bos_url( - repo_id, filename, repo_type=repo_type, revision=revision) + url_to_download = ppdiffusers_bos_url(repo_id, filename, repo_type=repo_type, revision=revision) blob_path = os.path.join(storage_folder, filename) # Prevent parallel downloads of the same file with a lock. @@ -312,10 +315,8 @@ def _resumable_file_manager(): resume_size = 0 else: temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, - mode="wb", - dir=cache_dir, - delete=False) + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) resume_size = 0 # Download to temporary file, then copy to cache dir once finished. 
@@ -328,7 +329,8 @@ def _resumable_file_manager(): temp_file, proxies=None, resume_size=resume_size, - headers=None, ) + headers=None, + ) logger.info("storing %s in cache at %s", url_to_download, blob_path) _chmod_and_replace(temp_file.name, blob_path) @@ -341,12 +343,13 @@ def _resumable_file_manager(): def ppdiffusers_url_download( - url_to_download: str, - cache_dir: Union[str, Path, None]=None, - filename: Optional[str]=None, - force_download: bool=False, - resume_download: bool=False, - file_lock_timeout: int=-1, ): + url_to_download: str, + cache_dir: Union[str, Path, None] = None, + filename: Optional[str] = None, + force_download: bool = False, + resume_download: bool = False, + file_lock_timeout: int = -1, +): if cache_dir is None: cache_dir = PPDIFFUSERS_CACHE if isinstance(cache_dir, Path): @@ -386,10 +389,8 @@ def _resumable_file_manager(): resume_size = 0 else: temp_file_manager = partial( # type: ignore - tempfile.NamedTemporaryFile, - mode="wb", - dir=cache_dir, - delete=False) + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) resume_size = 0 # Download to temporary file, then copy to cache dir once finished. @@ -402,7 +403,8 @@ def _resumable_file_manager(): temp_file, proxies=None, resume_size=resume_size, - headers=None, ) + headers=None, + ) logger.info("storing %s in cache at %s", url_to_download, file_path) _chmod_and_replace(temp_file.name, file_path) @@ -414,28 +416,29 @@ def _resumable_file_manager(): def bos_hf_download( - pretrained_model_name_or_path, - *, - filename, - subfolder, - cache_dir, - force_download=False, - revision=None, - from_hf_hub=False, - proxies=None, - resume_download=False, - local_files_only=None, - use_auth_token=None, - user_agent=None, - file_lock_timeout=-1, - commit_hash=None, ): + pretrained_model_name_or_path, + *, + filename, + subfolder, + cache_dir, + force_download=False, + revision=None, + from_hf_hub=False, + proxies=None, + resume_download=False, + local_files_only=None, + use_auth_token=None, + user_agent=None, + file_lock_timeout=-1, + commit_hash=None, +): if from_hf_hub: # 1. First check if deprecated way of loading from branches is used - if (revision in DEPRECATED_REVISION_ARGS and - (filename == WEIGHTS_NAME or - filename == TORCH_SAFETENSORS_WEIGHTS_NAME) and - version.parse(version.parse(__version__).base_version) >= - version.parse("0.17.0")): + if ( + revision in DEPRECATED_REVISION_ARGS + and (filename == WEIGHTS_NAME or filename == TORCH_SAFETENSORS_WEIGHTS_NAME) + and version.parse(version.parse(__version__).base_version) >= version.parse("0.17.0") + ): try: model_file = hf_hub_download( pretrained_model_name_or_path, @@ -448,15 +451,18 @@ def bos_hf_download( use_auth_token=use_auth_token, user_agent=user_agent, subfolder=subfolder, - revision=revision or commit_hash, ) + revision=revision or commit_hash, + ) warnings.warn( f"Loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` is deprecated. Loading instead from `revision='main'` with `variant={revision}`. Loading model variants via `revision='{revision}'` will be removed in diffusers v1. Please use `variant='{revision}'` instead.", - FutureWarning, ) + FutureWarning, + ) return model_file except: # noqa: E722 warnings.warn( f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. 
However, it appears that {pretrained_model_name_or_path} currently does not have a {_add_variant(filename, revision)} file in the 'main' branch of {pretrained_model_name_or_path}. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {_add_variant(filename, revision)}' so that the correct variant file can be added.", - FutureWarning, ) + FutureWarning, + ) # 2. Load model file as usual try: model_file = hf_hub_download( @@ -470,7 +476,8 @@ def bos_hf_download( use_auth_token=use_auth_token, user_agent=user_agent, subfolder=subfolder, - revision=revision, ) + revision=revision, + ) return model_file except RepositoryNotFoundError: @@ -478,7 +485,8 @@ def bos_hf_download( f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " - "login`.") + "login`." + ) except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " @@ -486,9 +494,7 @@ def bos_hf_download( f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." ) except EntryNotFoundError: - raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named {filename}." - ) + raise EnvironmentError(f"{pretrained_model_name_or_path} does not appear to have a file named {filename}.") except HTTPError as err: raise EnvironmentError( f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}" @@ -506,7 +512,8 @@ def bos_hf_download( f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a file named {filename}") + f"containing a file named {filename}" + ) except KeyboardInterrupt: raise EnvironmentError( "You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!" @@ -521,7 +528,8 @@ def bos_hf_download( resume_download=resume_download, subfolder=subfolder, revision=revision, - file_lock_timeout=file_lock_timeout, ) + file_lock_timeout=file_lock_timeout, + ) return model_file except HTTPError as err: raise EnvironmentError( @@ -529,13 +537,15 @@ def bos_hf_download( f"There was a specific connection error when trying to load '{pretrained_model_name_or_path}'! " f"We couldn't connect to '{PPNLP_BOS_RESOLVE_ENDPOINT}' to load this model, couldn't find it " f"in the cached files and it looks like '{pretrained_model_name_or_path}' is not the path to a " - f"directory containing a file named '{filename}'.") + f"directory containing a file named '{filename}'." + ) except EnvironmentError: raise EnvironmentError( f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " f"'{PPNLP_BOS_RESOLVE_ENDPOINT}', make sure you don't have a local directory with the same name. 
" f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " - f"containing a file named '{filename}'") + f"containing a file named '{filename}'" + ) except KeyboardInterrupt: raise EnvironmentError( "You have interrupted the download, if you want to continue the download, you can set `resume_download=True`!" @@ -562,20 +572,21 @@ def url_file_exists(url: str) -> bool: def ppdiffusers_bos_dir_download( - repo_id: str, - *, - revision: Optional[str]=None, - repo_type: Optional[str]=None, - cache_dir: Union[str, Path, None]=None, - force_download: bool=False, - resume_download: bool=False, - folder_names: Optional[Union[List[str], str]]=None, - max_workers: int=1, - tqdm_class: Optional[base_tqdm]=None, - variant: Optional[str]=None, - is_fastdeploy_model: Optional[str]=False, - file_lock_timeout: int=-1, - local_files_only: bool=False, ) -> str: + repo_id: str, + *, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + force_download: bool = False, + resume_download: bool = False, + folder_names: Optional[Union[List[str], str]] = None, + max_workers: int = 1, + tqdm_class: Optional[base_tqdm] = None, + variant: Optional[str] = None, + is_fastdeploy_model: Optional[str] = False, + file_lock_timeout: int = -1, + local_files_only: bool = False, +) -> str: # update repo id must end with @fastdeploy if is_fastdeploy_model and not repo_id.endswith("@fastdeploy"): repo_id = f"{repo_id}@fastdeploy" @@ -585,12 +596,9 @@ def ppdiffusers_bos_dir_download( filtered_repo_files = [["model_index.json", None]] for subfolder in folder_names: - allow_patterns = ALLOW_PATTERNS_MAPPING.get( - subfolder, ALLOW_PATTERNS_MAPPING["others"]) + allow_patterns = ALLOW_PATTERNS_MAPPING.get(subfolder, ALLOW_PATTERNS_MAPPING["others"]) if is_fastdeploy_model: - allow_patterns = [ - ap for ap in allow_patterns if "pdparams" not in ap - ] + allow_patterns = [ap for ap in allow_patterns if "pdparams" not in ap] allow_patterns.extend(["inference.pdiparams", "inference.pdmodel"]) for filename in allow_patterns: need_to_check_no_variant_file = False @@ -602,25 +610,31 @@ def ppdiffusers_bos_dir_download( url = ppdiffusers_bos_url( repo_id, filename=filename, - subfolder=subfolder, ) + subfolder=subfolder, + ) if url_file_exists(url): # exist file - filtered_repo_files.append([ - filename, - subfolder, - ]) + filtered_repo_files.append( + [ + filename, + subfolder, + ] + ) else: if need_to_check_no_variant_file: url = ppdiffusers_bos_url( repo_id, filename=raw_filename, - subfolder=subfolder, ) + subfolder=subfolder, + ) if url_file_exists(url): # exist file - filtered_repo_files.append([ - raw_filename, - subfolder, - ]) + filtered_repo_files.append( + [ + raw_filename, + subfolder, + ] + ) def _inner_ppdiffusers_bos_download(repo_file_list): filename, _subfolder = repo_file_list @@ -633,7 +647,8 @@ def _inner_ppdiffusers_bos_download(repo_file_list): revision=revision, resume_download=resume_download, force_download=force_download, - file_lock_timeout=file_lock_timeout, ) + file_lock_timeout=file_lock_timeout, + ) thread_map( _inner_ppdiffusers_bos_download, @@ -641,5 +656,6 @@ def _inner_ppdiffusers_bos_download(repo_file_list): desc=f"Fetching {len(filtered_repo_files)} files", max_workers=max_workers, # User can use its own tqdm class or the default one from `huggingface_hub.utils` - tqdm_class=tqdm_class or hf_tqdm, ) + tqdm_class=tqdm_class or hf_tqdm, + ) return os.path.join(cache_dir, repo_id) diff --git 
a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py index cca1dbd1d7d0d..fcbc659ea253c 100644 --- a/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py +++ b/ppdiffusers/ppdiffusers/utils/dummy_paddle_objects.py @@ -225,8 +225,7 @@ def get_cosine_schedule_with_warmup(*args, **kwargs): def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs): - requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, - ["paddle"]) + requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["paddle"]) def get_linear_schedule_with_warmup(*args, **kwargs): diff --git a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py b/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py index c1da547b98964..574a504c2d775 100644 --- a/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py +++ b/ppdiffusers/ppdiffusers/utils/dynamic_modules_utils.py @@ -26,14 +26,16 @@ from typing import Dict, Optional, Union from urllib import request -from huggingface_hub import (HfFolder, cached_download, hf_hub_download, - model_info) +from huggingface_hub import HfFolder, cached_download, hf_hub_download, model_info -from . import (PPDIFFUSERS_DYNAMIC_MODULE_NAME, PPDIFFUSERS_MODULES_CACHE, - logging) +from . import PPDIFFUSERS_DYNAMIC_MODULE_NAME, PPDIFFUSERS_MODULES_CACHE, logging -COMMUNITY_PIPELINES_URL = "https://raw.githubusercontent.com/PaddlePaddle/PaddleMIX/{revision}/ppdiffusers/examples/community/{pipeline}.py" -GITEE_COMMUNITY_PIPELINES_URL = "https://gitee.com/paddlepaddle/PaddleMIX/raw/{revision}/ppdiffusers/examples/community/{pipeline}.py" +COMMUNITY_PIPELINES_URL = ( + "https://raw.githubusercontent.com/PaddlePaddle/PaddleMIX/{revision}/ppdiffusers/examples/community/{pipeline}.py" +) +GITEE_COMMUNITY_PIPELINES_URL = ( + "https://gitee.com/paddlepaddle/PaddleMIX/raw/{revision}/ppdiffusers/examples/community/{pipeline}.py" +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -87,11 +89,9 @@ def get_relative_imports(module_file): content = f.read() # Imports of the form `import .xxx` - relative_imports = re.findall( - "^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE) + relative_imports = re.findall("^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE) # Imports of the form `from .xxx import yyy` - relative_imports += re.findall( - "^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE) + relative_imports += re.findall("^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE) # Unique-ify return list(set(relative_imports)) @@ -116,9 +116,7 @@ def get_relative_import_files(module_file): module_path = Path(module_file).parent new_import_files = [str(module_path / m) for m in new_imports] - new_import_files = [ - f for f in new_import_files if f not in all_relative_imports - ] + new_import_files = [f for f in new_import_files if f not in all_relative_imports] files_to_check = [f"{f}.py" for f in new_import_files] no_change = len(new_import_files) == 0 @@ -137,8 +135,7 @@ def check_imports(filename): # Imports of the form `import xxx` imports = re.findall("^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) # Imports of the form `from xxx import yyy` - imports += re.findall( - "^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) + imports += re.findall("^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) # Only keep the top-level module imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] @@ -187,29 +184,33 @@ def find_pipeline_class(loaded_module): 
pipeline_class = None for cls_name, cls in cls_members.items(): - if (cls_name != DiffusionPipeline.__name__ and - issubclass(cls, DiffusionPipeline) and - cls.__module__.split(".")[0] != "ppdiffusers"): + if ( + cls_name != DiffusionPipeline.__name__ + and issubclass(cls, DiffusionPipeline) + and cls.__module__.split(".")[0] != "ppdiffusers" + ): if pipeline_class is not None: raise ValueError( f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:" f" {pipeline_class.__name__}, and {cls_name}. Please make sure to define only one in" - f" {loaded_module}.") + f" {loaded_module}." + ) pipeline_class = cls return pipeline_class def get_cached_module_file( - pretrained_model_name_or_path: Union[str, os.PathLike], - module_file: str, - cache_dir: Optional[Union[str, os.PathLike]]=None, - force_download: bool=False, - resume_download: bool=False, - proxies: Optional[Dict[str, str]]=None, - use_auth_token: Optional[Union[bool, str]]=None, - revision: Optional[str]=None, - local_files_only: bool=False, ): + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, +): """ Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached Transformers module. @@ -260,8 +261,7 @@ def get_cached_module_file( # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. pretrained_model_name_or_path = str(pretrained_model_name_or_path) - module_file_or_url = os.path.join(pretrained_model_name_or_path, - module_file) + module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) if os.path.isfile(module_file_or_url): resolved_module_file = module_file_or_url @@ -273,8 +273,7 @@ def get_cached_module_file( logger.info(f"Defaulting to main: {revision}.") # community pipeline on GitHub - github_url = COMMUNITY_PIPELINES_URL.format( - revision=revision, pipeline=pretrained_model_name_or_path) + github_url = COMMUNITY_PIPELINES_URL.format(revision=revision, pipeline=pretrained_model_name_or_path) try: resolved_module_file = cached_download( github_url, @@ -283,13 +282,12 @@ def get_cached_module_file( proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, - use_auth_token=False, ) + use_auth_token=False, + ) submodule = "git" module_file = pretrained_model_name_or_path + ".py" except EnvironmentError: - logger.error( - f"Could not locate the {module_file} inside {pretrained_model_name_or_path}." - ) + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") raise else: try: @@ -302,13 +300,11 @@ def get_cached_module_file( proxies=proxies, resume_download=resume_download, local_files_only=local_files_only, - use_auth_token=use_auth_token, ) - submodule = os.path.join( - "local", "--".join(pretrained_model_name_or_path.split("/"))) - except EnvironmentError: - logger.error( - f"Could not locate the {module_file} inside {pretrained_model_name_or_path}." 
+ use_auth_token=use_auth_token, ) + submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/"))) + except EnvironmentError: + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") raise # Check we have all the requirements in our environment @@ -327,7 +323,8 @@ def get_cached_module_file( module_needed = f"{module_needed}.py" shutil.copy( os.path.join(pretrained_model_name_or_path, module_needed), - submodule_path / module_needed, ) + submodule_path / module_needed, + ) else: # Get the commit hash # TODO: we will get this info in the etag soon, so retrieve it from there and not here. @@ -338,8 +335,7 @@ def get_cached_module_file( else: token = None - commit_hash = model_info( - pretrained_model_name_or_path, revision=revision, token=token).sha + commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=token).sha # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the # benefit of versioning. @@ -361,22 +357,24 @@ def get_cached_module_file( proxies=proxies, use_auth_token=use_auth_token, revision=revision, - local_files_only=local_files_only, ) + local_files_only=local_files_only, + ) return os.path.join(full_submodule, module_file) def get_class_from_dynamic_module( - pretrained_model_name_or_path: Union[str, os.PathLike], - module_file: str, - class_name: Optional[str]=None, - cache_dir: Optional[Union[str, os.PathLike]]=None, - force_download: bool=False, - resume_download: bool=False, - proxies: Optional[Dict[str, str]]=None, - use_auth_token: Optional[Union[bool, str]]=None, - revision: Optional[str]=None, - local_files_only: bool=False, - **kwargs, ): + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + class_name: Optional[str] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): """ Extracts a class from a module file, present in the local folder or repository of a model. 
@@ -449,5 +447,6 @@ def get_class_from_dynamic_module( proxies=proxies, use_auth_token=use_auth_token, revision=revision, - local_files_only=local_files_only, ) + local_files_only=local_files_only, + ) return get_class_in_module(class_name, final_module.replace(".py", "")) diff --git a/ppdiffusers/ppdiffusers/utils/hub_utils.py b/ppdiffusers/ppdiffusers/utils/hub_utils.py index 391c8099f0b30..8de82f5ab9800 100644 --- a/ppdiffusers/ppdiffusers/utils/hub_utils.py +++ b/ppdiffusers/ppdiffusers/utils/hub_utils.py @@ -28,8 +28,14 @@ from ..version import VERSION as __version__ from .constants import DIFFUSERS_CACHE, HUGGINGFACE_CO_RESOLVE_ENDPOINT from .import_utils import ( - ENV_VARS_TRUE_VALUES, _fastdeploy_version, _paddle_version, _torch_version, - is_fastdeploy_available, is_paddle_available, is_torch_available) + ENV_VARS_TRUE_VALUES, + _fastdeploy_version, + _paddle_version, + _torch_version, + is_fastdeploy_available, + is_paddle_available, + is_torch_available, +) from .logging import get_logger logger = get_logger(__name__) @@ -37,12 +43,11 @@ MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md" SESSION_ID = uuid4().hex HF_HUB_OFFLINE = os.getenv("HF_HUB_OFFLINE", "").upper() in ENV_VARS_TRUE_VALUES -DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", - "").upper() in ENV_VARS_TRUE_VALUES +DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "").upper() in ENV_VARS_TRUE_VALUES HUGGINGFACE_CO_TELEMETRY = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/api/telemetry/" -def http_user_agent(user_agent: Union[Dict, str, None]=None) -> str: +def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: """ Formats a user-agent string with basic info about a request. """ @@ -65,9 +70,7 @@ def http_user_agent(user_agent: Union[Dict, str, None]=None) -> str: return ua -def get_full_repo_name(model_id: str, - organization: Optional[str]=None, - token: Optional[str]=None): +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): if token is None: token = HfFolder.get_token() if organization is None: @@ -82,7 +85,8 @@ def create_model_card(args, model_name): raise ValueError( "Modelcard rendering is based on Jinja templates." " Please make sure to have `jinja` installed before using `create_model_card`." - " To install it, please run `pip install Jinja2`.") + " To install it, please run `pip install Jinja2`." 
+ ) if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]: return @@ -97,41 +101,35 @@ def create_model_card(args, model_name): library_name="ppdiffusers", tags=[], datasets=args.dataset_name, - metrics=[], ), + metrics=[], + ), template_path=MODEL_CARD_TEMPLATE_PATH, model_name=model_name, repo_name=repo_name, - dataset_name=args.dataset_name - if hasattr(args, "dataset_name") else None, + dataset_name=args.dataset_name if hasattr(args, "dataset_name") else None, learning_rate=args.learning_rate, train_batch_size=args.train_batch_size, eval_batch_size=args.eval_batch_size, gradient_accumulation_steps=( - args.gradient_accumulation_steps - if hasattr(args, "gradient_accumulation_steps") else None), + args.gradient_accumulation_steps if hasattr(args, "gradient_accumulation_steps") else None + ), adam_beta1=args.adam_beta1 if hasattr(args, "adam_beta1") else None, adam_beta2=args.adam_beta2 if hasattr(args, "adam_beta2") else None, - adam_weight_decay=args.adam_weight_decay - if hasattr(args, "adam_weight_decay") else None, - adam_epsilon=args.adam_epsilon - if hasattr(args, "adam_epsilon") else None, - lr_scheduler=args.lr_scheduler - if hasattr(args, "lr_scheduler") else None, - lr_warmup_steps=args.lr_warmup_steps - if hasattr(args, "lr_warmup_steps") else None, - ema_inv_gamma=args.ema_inv_gamma - if hasattr(args, "ema_inv_gamma") else None, + adam_weight_decay=args.adam_weight_decay if hasattr(args, "adam_weight_decay") else None, + adam_epsilon=args.adam_epsilon if hasattr(args, "adam_epsilon") else None, + lr_scheduler=args.lr_scheduler if hasattr(args, "lr_scheduler") else None, + lr_warmup_steps=args.lr_warmup_steps if hasattr(args, "lr_warmup_steps") else None, + ema_inv_gamma=args.ema_inv_gamma if hasattr(args, "ema_inv_gamma") else None, ema_power=args.ema_power if hasattr(args, "ema_power") else None, - ema_max_decay=args.ema_max_decay - if hasattr(args, "ema_max_decay") else None, - mixed_precision=args.mixed_precision, ) + ema_max_decay=args.ema_max_decay if hasattr(args, "ema_max_decay") else None, + mixed_precision=args.mixed_precision, + ) card_path = os.path.join(args.output_dir, "README.md") model_card.save(card_path) -def extract_commit_hash(resolved_file: Optional[str], - commit_hash: Optional[str]=None): +def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str] = None): """ Extracts the commit hash from a resolved filename toward a cache file. """ @@ -150,14 +148,12 @@ def extract_commit_hash(resolved_file: Optional[str], # - Diffusers doesn't use custom environment variables to specify the cache path. # - There is no need to migrate the cache format, just move the files to the new location. 
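# Illustrative sketch (not part of the patch) of the `extract_commit_hash` helper whose signature
# appears above, assuming the usual hub cache layout where resolved files sit under
# ``snapshots/<commit>/``. The path and the 40-character hash below are hypothetical placeholders.
from ppdiffusers.utils.hub_utils import extract_commit_hash  # module path taken from this diff

resolved_file = (
    "models--some-org--some-model/snapshots/"
    "39593d5650112b4cc580433f6b0435385882d819/model_index.json"
)
print(extract_commit_hash(resolved_file))  # expected: "39593d5650112b4cc580433f6b0435385882d819"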
hf_cache_home = os.path.expanduser( - os.getenv("HF_HOME", - os.path.join( - os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))) + os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) +) old_diffusers_cache = os.path.join(hf_cache_home, "diffusers") -def move_cache(old_cache_dir: Optional[str]=None, - new_cache_dir: Optional[str]=None) -> None: +def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] = None) -> None: if new_cache_dir is None: new_cache_dir = DIFFUSERS_CACHE if old_cache_dir is None: @@ -168,8 +164,7 @@ def move_cache(old_cache_dir: Optional[str]=None, # move file blob by blob for old_blob_path in old_cache_dir.glob("**/blobs/*"): if old_blob_path.is_file() and not old_blob_path.is_symlink(): - new_blob_path = new_cache_dir / old_blob_path.relative_to( - old_cache_dir) + new_blob_path = new_cache_dir / old_blob_path.relative_to(old_cache_dir) new_blob_path.parent.mkdir(parents=True, exist_ok=True) os.replace(old_blob_path, new_blob_path) try: @@ -182,8 +177,7 @@ def move_cache(old_cache_dir: Optional[str]=None, # At this point, old_cache_dir contains symlinks to the new cache (it can still be used). -cache_version_file = os.path.join(DIFFUSERS_CACHE, - "version_diffusers_cache.txt") +cache_version_file = os.path.join(DIFFUSERS_CACHE, "version_diffusers_cache.txt") if not os.path.isfile(cache_version_file): cache_version = 0 else: @@ -194,13 +188,13 @@ def move_cache(old_cache_dir: Optional[str]=None, cache_version = 0 if cache_version < 1: - old_cache_is_not_empty = (os.path.isdir(old_diffusers_cache) and - len(os.listdir(old_diffusers_cache)) > 0) + old_cache_is_not_empty = os.path.isdir(old_diffusers_cache) and len(os.listdir(old_diffusers_cache)) > 0 if old_cache_is_not_empty: logger.warning( "The cache for model files in Diffusers v0.14.0 has moved to a new location. Moving your " "existing cached models. This is a one-time operation, you can interrupt it or run it " - "later by calling `diffusers.utils.hub_utils.move_cache()`.") + "later by calling `diffusers.utils.hub_utils.move_cache()`." + ) try: move_cache() except Exception as e: @@ -208,7 +202,8 @@ def move_cache(old_cache_dir: Optional[str]=None, logger.error( f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease " "file an issue at https://github.com/huggingface/diffusers/issues/new/choose, copy paste this whole " - "message and we will do our best to help.") + "message and we will do our best to help." + ) if cache_version < 1: try: @@ -218,4 +213,5 @@ def move_cache(old_cache_dir: Optional[str]=None, except Exception: logger.warning( f"There was a problem when trying to write in your cache folder ({DIFFUSERS_CACHE}). Please, ensure " - "the directory exists and can be written to.") + "the directory exists and can be written to." 
+ ) diff --git a/ppdiffusers/ppdiffusers/utils/import_utils.py b/ppdiffusers/ppdiffusers/utils/import_utils.py index 577f4d0dc3498..cc43592be9962 100644 --- a/ppdiffusers/ppdiffusers/utils/import_utils.py +++ b/ppdiffusers/ppdiffusers/utils/import_utils.py @@ -64,8 +64,9 @@ if _paddle_available: try: - from paddle.incubate.nn.memory_efficient_attention import \ - memory_efficient_attention # noqa + from paddle.incubate.nn.memory_efficient_attention import ( # noqa + memory_efficient_attention, + ) _ppxformers_available = True except ImportError: @@ -90,8 +91,7 @@ if _safetensors_available: try: _safetensors_version = importlib_metadata.version("safetensors") - logger.info( - f"Safetensors version {_safetensors_version} available.") + logger.info(f"Safetensors version {_safetensors_version} available.") except importlib_metadata.PackageNotFoundError: _safetensors_available = False else: @@ -101,8 +101,7 @@ _transformers_available = importlib.util.find_spec("transformers") is not None try: _transformers_version = importlib_metadata.version("transformers") - logger.debug( - f"Successfully imported transformers version {_transformers_version}") + logger.debug(f"Successfully imported transformers version {_transformers_version}") except importlib_metadata.PackageNotFoundError: _transformers_available = False @@ -116,8 +115,7 @@ _unidecode_available = importlib.util.find_spec("unidecode") is not None try: _unidecode_version = importlib_metadata.version("unidecode") - logger.debug( - f"Successfully imported unidecode version {_unidecode_version}") + logger.debug(f"Successfully imported unidecode version {_unidecode_version}") except importlib_metadata.PackageNotFoundError: _unidecode_available = False @@ -134,14 +132,12 @@ pass _fastdeploy_available = _fastdeploy_version != "N/A" if _fastdeploy_available: - logger.debug( - f"Successfully imported fastdeploy version {_fastdeploy_version}") + logger.debug(f"Successfully imported fastdeploy version {_fastdeploy_version}") _paddlenlp_available = importlib.util.find_spec("paddlenlp") is not None try: _paddlenlp_version = importlib_metadata.version("paddlenlp") - logger.debug( - f"Successfully imported paddlenlp version {_paddlenlp_version}") + logger.debug(f"Successfully imported paddlenlp version {_paddlenlp_version}") except importlib_metadata.PackageNotFoundError: _paddlenlp_available = False @@ -152,7 +148,8 @@ "opencv-python", "opencv-contrib-python", "opencv-python-headless", - "opencv-contrib-python-headless", ) + "opencv-contrib-python-headless", + ) _opencv_version = None for pkg in candidates: try: @@ -183,8 +180,7 @@ _k_diffusion_available = importlib.util.find_spec("k_diffusion") is not None try: _k_diffusion_version = importlib_metadata.version("k_diffusion") - logger.debug( - f"Successfully imported k-diffusion version {_k_diffusion_version}") + logger.debug(f"Successfully imported k-diffusion version {_k_diffusion_version}") except importlib_metadata.PackageNotFoundError: _k_diffusion_available = False @@ -205,16 +201,14 @@ _omegaconf_available = importlib.util.find_spec("omegaconf") is not None try: _omegaconf_version = importlib_metadata.version("omegaconf") - logger.debug( - f"Successfully imported omegaconf version {_omegaconf_version}") + logger.debug(f"Successfully imported omegaconf version {_omegaconf_version}") except importlib_metadata.PackageNotFoundError: _omegaconf_available = False _tensorboard_available = importlib.util.find_spec("tensorboard") try: _tensorboard_version = importlib_metadata.version("tensorboard") - 
logger.debug( - f"Successfully imported tensorboard version {_tensorboard_version}") + logger.debug(f"Successfully imported tensorboard version {_tensorboard_version}") except importlib_metadata.PackageNotFoundError: _tensorboard_available = False @@ -232,8 +226,7 @@ import einops.layers.paddle einops.layers.paddle - logger.debug( - f"Successfully imported einops version {einops.__version__}") + logger.debug(f"Successfully imported einops version {einops.__version__}") except ImportError: _einops_available = False except importlib_metadata.PackageNotFoundError: @@ -482,27 +475,29 @@ def is_bs4_available(): that match your environment. Please note that you may need to restart your runtime after installation. """ -BACKENDS_MAPPING = OrderedDict([ - ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), - ("fastdeploy", (is_fastdeploy_available, FASTDEPLOY_IMPORT_ERROR)), - ("paddle", (is_paddle_available, PADDLE_IMPORT_ERROR)), - ("paddlenlp", (is_paddlenlp_available, PADDLENLP_IMPORT_ERROR)), - ("visualdl", (is_visualdl_available, VISUALDL_IMPORT_ERROR)), - ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), - ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)), - ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), - ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), - ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), - ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), - ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), - ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), - ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), - ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), - ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), - ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), - ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), - ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), -]) +BACKENDS_MAPPING = OrderedDict( + [ + ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), + ("fastdeploy", (is_fastdeploy_available, FASTDEPLOY_IMPORT_ERROR)), + ("paddle", (is_paddle_available, PADDLE_IMPORT_ERROR)), + ("paddlenlp", (is_paddlenlp_available, PADDLENLP_IMPORT_ERROR)), + ("visualdl", (is_visualdl_available, VISUALDL_IMPORT_ERROR)), + ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), + ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)), + ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), + ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), + ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), + ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), + ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), + ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), + ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), + ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), + ("einops", (is_einops_available, EINOPS_IMPORT_ERROR)), + ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), + ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), + ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), + ] +) def requires_backends(obj, backends): @@ -516,26 +511,24 @@ def requires_backends(obj, backends): raise ImportError("".join(failed)) if name in [ - "VersatileDiffusionTextToImagePipeline", - "VersatileDiffusionPipeline", - "VersatileDiffusionDualGuidedPipeline", - "StableDiffusionImageVariationPipeline", - "UnCLIPPipeline", + "VersatileDiffusionTextToImagePipeline", + "VersatileDiffusionPipeline", + 
"VersatileDiffusionDualGuidedPipeline", + "StableDiffusionImageVariationPipeline", + "UnCLIPPipeline", ] and is_paddlenlp_version("<", "2.5.0"): raise ImportError( f"You need to install `paddlenlp>=2.5.0` in order to use {name}: \n```\n pip install" - " --upgrade paddlenlp \n```") + " --upgrade paddlenlp \n```" + ) - if name in [ - "StableDiffusionDepth2ImgPipeline", - "StableDiffusionPix2PixZeroPipeline", - ] and is_paddlenlp_version( - "<", - "2.5.1" # TODO version + if name in ["StableDiffusionDepth2ImgPipeline", "StableDiffusionPix2PixZeroPipeline"] and is_paddlenlp_version( + "<", "2.5.1" # TODO version ): raise ImportError( f"You need to install `paddlenlp>=2.5.1` in order to use {name}: \n```\n pip install" - " --upgrade paddlenlp \n```") + " --upgrade paddlenlp \n```" + ) class DummyObject(type): @@ -551,9 +544,7 @@ def __getattr__(cls, key): # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 -def compare_versions(library_or_version: Union[str, Version], - operation: str, - requirement_version: str): +def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): """ Args: Compares a library version to some requirement using a given operation. @@ -565,13 +556,10 @@ def compare_versions(library_or_version: Union[str, Version], The version to compare the library version against """ if operation not in STR_OPERATION_TO_FUNC.keys(): - raise ValueError( - f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}" - ) + raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}") operation = STR_OPERATION_TO_FUNC[operation] if isinstance(library_or_version, str): - library_or_version = parse( - importlib_metadata.version(library_or_version)) + library_or_version = parse(importlib_metadata.version(library_or_version)) return operation(library_or_version, parse(requirement_version)) diff --git a/ppdiffusers/ppdiffusers/utils/initializer_utils.py b/ppdiffusers/ppdiffusers/utils/initializer_utils.py index 9c71cc89861c9..263c7c41a030f 100644 --- a/ppdiffusers/ppdiffusers/utils/initializer_utils.py +++ b/ppdiffusers/ppdiffusers/utils/initializer_utils.py @@ -46,9 +46,7 @@ def _no_grad_uniform_(tensor, a, b): def _no_grad_normal_(tensor, mean=0.0, std=1.0): with paddle.no_grad(): - tensor.copy_( - paddle.normal( - mean=mean, std=std, shape=tensor.shape), True) + tensor.copy_(paddle.normal(mean=mean, std=std, shape=tensor.shape), True) return tensor @@ -134,9 +132,7 @@ def _calculate_fan_in_and_fan_out(tensor, reverse=False): Tuple[fan_in, fan_out] """ if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") if reverse: num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] @@ -189,8 +185,7 @@ def _calculate_correct_fan(tensor, mode, reverse=False): mode = mode.lower() valid_modes = ["fan_in", "fan_out"] if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) @@ -216,13 +211,11 @@ def _calculate_gain(nonlinearity, param=None): elif nonlinearity == "leaky_relu": if param is 
None: negative_slope = 0.01 - elif (not isinstance(param, bool) and isinstance(param, int) or - isinstance(param, float)): + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): # True/False are instances of int, hence check above negative_slope = param else: - raise ValueError("negative_slope {} not a valid number".format( - param)) + raise ValueError("negative_slope {} not a valid number".format(param)) return math.sqrt(2.0 / (1 + negative_slope**2)) elif nonlinearity == "selu": return 3.0 / 4 @@ -230,11 +223,7 @@ def _calculate_gain(nonlinearity, param=None): raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) -def kaiming_uniform_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_uniform method Args: @@ -252,11 +241,7 @@ def kaiming_uniform_(tensor, return _no_grad_uniform_(tensor, -k, k) -def kaiming_normal_(tensor, - a=0, - mode="fan_in", - nonlinearity="leaky_relu", - reverse=False): +def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): """ Modified tensor inspace using kaiming_normal_ Args: @@ -304,8 +289,7 @@ def reset_initialized_parameter(model, include_self=True): """ for _, m in model.named_sublayers(include_self=include_self): if isinstance(m, nn.Conv2D): - k = float(m._groups) / (m._in_channels * m._kernel_size[0] * - m._kernel_size[1]) + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) k = math.sqrt(k) _no_grad_uniform_(m.weight, -k, k) if hasattr(m, "bias") and getattr(m, "bias") is not None: @@ -330,17 +314,17 @@ def reset_initialized_parameter(model, include_self=True): class Init: def __init__(self): for init_func in [ - uniform_, - normal_, - constant_, - ones_, - zeros_, - xavier_uniform_, - xavier_normal_, - kaiming_uniform_, - kaiming_normal_, - linear_init_, - conv_init_, + uniform_, + normal_, + constant_, + ones_, + zeros_, + xavier_uniform_, + xavier_normal_, + kaiming_uniform_, + kaiming_normal_, + linear_init_, + conv_init_, ]: setattr(self, init_func.__name__, init_func) diff --git a/ppdiffusers/ppdiffusers/utils/load_utils.py b/ppdiffusers/ppdiffusers/utils/load_utils.py index 023551a27ce6d..a1602c8862d80 100644 --- a/ppdiffusers/ppdiffusers/utils/load_utils.py +++ b/ppdiffusers/ppdiffusers/utils/load_utils.py @@ -24,8 +24,11 @@ import numpy as np from .constants import get_map_location_default -from .import_utils import (is_paddle_available, is_safetensors_available, - is_torch_available) +from .import_utils import ( + is_paddle_available, + is_safetensors_available, + is_torch_available, +) from .logging import get_logger logger = get_logger(__name__) @@ -68,8 +71,7 @@ def read_prefix_key(path): with open(path, "rb") as file_handler: end_index = seek_by_string(file_handler, "data.pkl", file_size) file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE) - prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - - len("/data.pkl")) + prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - len("/data.pkl")) return prefix_key.decode("latin") @@ -89,8 +91,7 @@ def seek_by_string(file_handler, string: str, file_size: int) -> int: word_index = 0 if file_handler.tell() >= file_size - 1: - raise Exception( - f"can't find the find the target string<{string}> in the file") + raise Exception(f"can't find the find the target string<{string}> in the file") return 
file_handler.tell() @@ -163,21 +164,18 @@ def find_class(self, mod_name, name): return super().find_class(mod_name, name) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): # if a tensor has shape [M, N] and stride is [1, N], it's column-wise / fortran-style # if a tensor has shape [M, N] and stride is [M, 1], it's row-wise / C-style # defautls to C-style - if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[ - 1] > 1: + if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[1] > 1: order = "F" else: order = "C" # fix bug when load https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth numel = int(np.prod(size)) - return storage[storage_offset:storage_offset + numel].reshape( - size, order=order) + return storage[storage_offset : storage_offset + numel].reshape(size, order=order) def _rebuild_parameter(data, requires_grad, backward_hooks): @@ -207,8 +205,7 @@ def torch_load(path: str, **pickle_load_args): def load_tensor(dtype, numel, key, location): name = f"{prefix_key}/data/{key}" - typed_storage = np.frombuffer( - torch_zip.open(name).read()[:numel], dtype=dtype) + typed_storage = np.frombuffer(torch_zip.open(name).read()[:numel], dtype=dtype) return typed_storage def persistent_load(saved_id): @@ -226,15 +223,13 @@ def persistent_load(saved_id): typed_storage = loaded_storages[key] else: nbytes = numel * _element_size(dtype) - typed_storage = load_tensor(dtype, nbytes, key, - _maybe_decode_ascii(location)) + typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location)) loaded_storages[key] = typed_storage return typed_storage data_iostream = torch_zip.open(f"{prefix_key}/data.pkl").read() - unpickler_stage = UnpicklerWrapperStage( - io.BytesIO(data_iostream), **pickle_load_args) + unpickler_stage = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) unpickler_stage.persistent_load = persistent_load state_dict = unpickler_stage.load() torch_zip.close() @@ -263,19 +258,18 @@ def convert_to_paddle(state_dict, return_numpy=False, return_global_step=False): # if "position_id" in k and "int" not in str(v.dtype): # v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64") if v.ndim == 0: - v = v.reshape((1, )) + v = v.reshape((1,)) if not return_numpy: # support bfloat16 if "torch.bfloat16" in str(v.dtype): v = v.float() pd_state_dict[k] = ( paddle.to_tensor(v.numpy()).cast(paddle.bfloat16) - if hasattr(v, "numpy") else - paddle.to_tensor(v).cast(paddle.bfloat16)) + if hasattr(v, "numpy") + else paddle.to_tensor(v).cast(paddle.bfloat16) + ) else: - pd_state_dict[k] = (paddle.to_tensor(v.numpy()) - if hasattr(v, "numpy") else - paddle.to_tensor(v)) + pd_state_dict[k] = paddle.to_tensor(v.numpy()) if hasattr(v, "numpy") else paddle.to_tensor(v) else: pd_state_dict[k] = v.numpy() if hasattr(v, "numpy") else v @@ -290,7 +284,7 @@ def convert_to_numpy(state_dict): # if "position_id" in k and "int" not in str(v.dtype): # v = v.numpy().astype("int64") if hasattr(v, "numpy") else v.astype("int64") if v.ndim == 0: - v = v.reshape((1, )) + v = v.reshape((1,)) return pd_state_dict @@ -310,19 +304,18 @@ def safetensors_load(path: str): data = load_file(path) else: - raise ImportError( - "`safetensors_load` requires the `safetensors library: `pip install safetensors`." 
- ) + raise ImportError("`safetensors_load` requires the `safetensors library: `pip install safetensors`.") return data def smart_load( - path: str, - map_location: str=None, - return_numpy: bool=False, - return_global_step: bool=False, - return_is_torch_weight: bool=False, ): + path: str, + map_location: str = None, + return_numpy: bool = False, + return_global_step: bool = False, + return_is_torch_weight: bool = False, +): if map_location is None: map_location = get_map_location_default() @@ -335,46 +328,36 @@ def smart_load( return state_dict if suffix in torch_suffix: - state_dict = convert_to_paddle( - torch_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict if suffix in safetensors_suffix: - state_dict = convert_to_paddle( - safetensors_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict # must use safetensors_load first try: - state_dict = convert_to_paddle( - safetensors_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(safetensors_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict except Exception: logger.info(f"Cant load file {name} with safetensors!") try: - state_dict = convert_to_paddle( - torch_load(path), return_numpy, return_global_step) + state_dict = convert_to_paddle(torch_load(path), return_numpy, return_global_step) if return_is_torch_weight: state_dict["is_torch_weight"] = True return state_dict except Exception: - logger.info( - f"Cant load file {name} with torch! We will try to load this with safetensors!" - ) + logger.info(f"Cant load file {name} with torch! We will try to load this with safetensors!") try: state_dict = paddle.load(path, return_numpy=return_numpy) return state_dict except Exception: - logger.info( - f"Cant load file {name} with paddle! We will try to load this with torch/safetensors!" - ) + logger.info(f"Cant load file {name} with paddle! We will try to load this with torch/safetensors!") if state_dict is None: - raise ValueError( - f"Cant load {name}, currently we only support ['torch', 'safetensors', 'paddle']!" - ) + raise ValueError(f"Cant load {name}, currently we only support ['torch', 'safetensors', 'paddle']!") diff --git a/ppdiffusers/ppdiffusers/utils/logging.py b/ppdiffusers/ppdiffusers/utils/logging.py index 355cb16bd50bd..12b12c075d2ef 100644 --- a/ppdiffusers/ppdiffusers/utils/logging.py +++ b/ppdiffusers/ppdiffusers/utils/logging.py @@ -58,7 +58,8 @@ def _get_default_logging_level(): else: logging.getLogger().warning( f"Unknown option PPDIFFUSERS_VERBOSITY={env_level_str}, " - f"has to be one of: { ', '.join(log_levels.keys()) }") + f"has to be one of: { ', '.join(log_levels.keys()) }" + ) return _default_log_level @@ -104,7 +105,7 @@ def get_log_levels_dict(): return log_levels -def get_logger(name: Optional[str]=None) -> logging.Logger: +def get_logger(name: Optional[str] = None) -> logging.Logger: """ Return a logger with the specified name. 
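# Illustrative sketch (not part of the patch): obtaining the library logger and raising verbosity.
# `get_logger` and the PPDIFFUSERS_VERBOSITY environment variable are shown in the hunks above;
# `set_verbosity_info` is assumed to be exposed by the same logging module.
from ppdiffusers.utils import logging

logging.set_verbosity_info()            # or: export PPDIFFUSERS_VERBOSITY=info
logger = logging.get_logger(__name__)
logger.info("ppdiffusers logging configured")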
@@ -212,8 +213,7 @@ def remove_handler(handler: logging.Handler) -> None: _configure_library_root_logger() - assert handler is not None and handler not in _get_library_root_logger( - ).handlers + assert handler is not None and handler not in _get_library_root_logger().handlers _get_library_root_logger().removeHandler(handler) @@ -247,8 +247,7 @@ def enable_explicit_format() -> None: handlers = _get_library_root_logger().handlers for handler in handlers: - formatter = logging.Formatter( - "[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") handler.setFormatter(formatter) diff --git a/ppdiffusers/ppdiffusers/utils/outputs.py b/ppdiffusers/ppdiffusers/utils/outputs.py index cd319b7378749..b71ef22559c47 100644 --- a/ppdiffusers/ppdiffusers/utils/outputs.py +++ b/ppdiffusers/ppdiffusers/utils/outputs.py @@ -60,8 +60,7 @@ def __post_init__(self): raise ValueError(f"{self.__class__.__name__} has no fields.") first_field = getattr(self, class_fields[0].name) - other_fields_are_none = all( - getattr(self, field.name) is None for field in class_fields[1:]) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) if other_fields_are_none and isinstance(first_field, dict): for key, value in first_field.items(): @@ -73,23 +72,16 @@ def __post_init__(self): self[field.name] = v def __delitem__(self, *args, **kwargs): - raise Exception( - f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance." - ) + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") def setdefault(self, *args, **kwargs): - raise Exception( - f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance." - ) + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") def pop(self, *args, **kwargs): - raise Exception( - f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") def update(self, *args, **kwargs): - raise Exception( - f"You cannot use ``update`` on a {self.__class__.__name__} instance." 
- ) + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") def __getitem__(self, k): if isinstance(k, str): @@ -121,6 +113,6 @@ def to_tuple(self) -> Tuple[Any]: for field in fields(self): if getattr(self, field.name, None) is None: continue - tuples = tuples + (getattr(self, field.name), ) + tuples = tuples + (getattr(self, field.name),) return tuples diff --git a/ppdiffusers/ppdiffusers/utils/paddle_utils.py b/ppdiffusers/ppdiffusers/utils/paddle_utils.py index a59bfd24f7166..1fa9da783471b 100644 --- a/ppdiffusers/ppdiffusers/utils/paddle_utils.py +++ b/ppdiffusers/ppdiffusers/utils/paddle_utils.py @@ -43,8 +43,7 @@ def manual_seed(self, seed, generator_name=None): if generator_name is None: generator_name = str(time.time()) if generator_name in self.states_: - raise ValueError("state {} already exists".format( - generator_name)) + raise ValueError("state {} already exists".format(generator_name)) orig_rng_state = paddle.get_cuda_rng_state() paddle.seed(seed) self.states_[generator_name] = paddle.get_cuda_rng_state() @@ -55,8 +54,7 @@ def manual_seed(self, seed, generator_name=None): def rng_state(self, generator_name=None): if generator_name is not None: if generator_name not in self.states_: - raise ValueError("state {} does not exist".format( - generator_name)) + raise ValueError("state {} does not exist".format(generator_name)) orig_cuda_rng_state = paddle.get_cuda_rng_state() paddle.set_cuda_rng_state(self.states_[generator_name]) try: @@ -81,16 +79,13 @@ def get_rng_state_tracker(*args, **kwargs): @paddle.jit.not_to_static def randn_pt(shape, dtype=None, name=None, **kwargs): generator = kwargs.get("generator", None) - is_bfloat16 = ("bfloat16" in str(dtype) or - "bfloat16" in paddle.get_default_dtype()) + is_bfloat16 = "bfloat16" in str(dtype) or "bfloat16" in paddle.get_default_dtype() if is_bfloat16: if generator is None: - return randn( - shape, dtype="float16", name=name).cast(paddle.bfloat16) + return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16) else: with get_rng_state_tracker().rng_state(generator): - return randn( - shape, dtype="float16", name=name).cast(paddle.bfloat16) + return randn(shape, dtype="float16", name=name).cast(paddle.bfloat16) else: if generator is None: return randn(shape, dtype=dtype, name=name) @@ -108,24 +103,20 @@ def rand_pt(shape, dtype=None, name=None, **kwargs): return rand(shape, dtype=dtype, name=name) @paddle.jit.not_to_static - def randint_pt(low=0, high=None, shape=[1], dtype=None, name=None, - **kwargs): + def randint_pt(low=0, high=None, shape=[1], dtype=None, name=None, **kwargs): generator = kwargs.get("generator", None) if generator is None: - return randint( - low=low, high=high, shape=shape, dtype=dtype, name=name) + return randint(low=low, high=high, shape=shape, dtype=dtype, name=name) else: with get_rng_state_tracker().rng_state(generator): - return randint( - low=low, high=high, shape=shape, dtype=dtype, name=name) + return randint(low=low, high=high, shape=shape, dtype=dtype, name=name) @paddle.jit.not_to_static def randn_like_pt(x, dtype=None, name=None, **kwargs): generator = kwargs.get("generator", None) if dtype is None: dtype = x.dtype - return randn_pt( - x.shape, dtype=dtype, generator=generator, name=name, **kwargs) + return randn_pt(x.shape, dtype=dtype, generator=generator, name=name, **kwargs) paddle.randn = randn_pt paddle.rand = rand_pt @@ -133,23 +124,19 @@ def randn_like_pt(x, dtype=None, name=None, **kwargs): paddle.randn_like = randn_like_pt def randn_tensor( - 
shape: Union[Tuple, List], - generator: Optional[Union[List["paddle.Generator"], - "paddle.Generator"]]=None, - dtype: Optional["paddle.dtype"]=None, - *kwargs, ): + shape: Union[Tuple, List], + generator: Optional[Union[List["paddle.Generator"], "paddle.Generator"]] = None, + dtype: Optional["paddle.dtype"] = None, + *kwargs, + ): """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor will always be created on CPU. """ if isinstance(generator, (list, tuple)): batch_size = shape[0] - shape = (1, ) + tuple(shape[1:]) - latents = [ - randn_pt( - shape, generator=generator[i], dtype=dtype) - for i in range(batch_size) - ] + shape = (1,) + tuple(shape[1:]) + latents = [randn_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)] latents = paddle.concat(latents, axis=0) else: latents = randn_pt(shape, generator=generator, dtype=dtype) @@ -157,23 +144,19 @@ def randn_tensor( return latents def rand_tensor( - shape: Union[Tuple, List], - generator: Optional[Union[List["paddle.Generator"], - "paddle.Generator"]]=None, - dtype: Optional["paddle.dtype"]=None, - *kwargs, ): + shape: Union[Tuple, List], + generator: Optional[Union[List["paddle.Generator"], "paddle.Generator"]] = None, + dtype: Optional["paddle.dtype"] = None, + *kwargs, + ): """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor will always be created on CPU. """ if isinstance(generator, (list, tuple)): batch_size = shape[0] - shape = (1, ) + tuple(shape[1:]) - latents = [ - rand_pt( - shape, generator=generator[i], dtype=dtype) - for i in range(batch_size) - ] + shape = (1,) + tuple(shape[1:]) + latents = [rand_pt(shape, generator=generator[i], dtype=dtype) for i in range(batch_size)] latents = paddle.concat(latents, axis=0) else: latents = rand_pt(shape, generator=generator, dtype=dtype) @@ -181,18 +164,18 @@ def rand_tensor( return latents def randint_tensor( - low=0, - high=None, - shape: Union[Tuple, List]=[1], - generator: Optional["paddle.Generator"]=None, - dtype: Optional["paddle.dtype"]=None, - *kwargs, ): + low=0, + high=None, + shape: Union[Tuple, List] = [1], + generator: Optional["paddle.Generator"] = None, + dtype: Optional["paddle.dtype"] = None, + *kwargs, + ): """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor will always be created on CPU. 
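# Illustrative sketch (not part of the patch) of the random-tensor helpers above. The export
# path is assumed; per the docstrings, passing a list as `generator` seeds each batch element
# separately, while omitting it (as here) falls through to the patched `paddle.randn`.
import paddle
from ppdiffusers.utils import randn_tensor  # import path assumed

latents = randn_tensor((2, 4, 64, 64), dtype=paddle.float32)
print(latents.shape)  # [2, 4, 64, 64]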
""" - latents = randint_pt( - low=low, high=high, shape=shape, dtype=dtype, generator=generator) + latents = randint_pt(low=low, high=high, shape=shape, dtype=dtype, generator=generator) return latents diff --git a/ppdiffusers/ppdiffusers/utils/pil_utils.py b/ppdiffusers/ppdiffusers/utils/pil_utils.py index 7d41b9c74c07a..bef4901a7e5f8 100644 --- a/ppdiffusers/ppdiffusers/utils/pil_utils.py +++ b/ppdiffusers/ppdiffusers/utils/pil_utils.py @@ -18,8 +18,7 @@ from packaging import version from PIL import Image -if version.parse(version.parse(PIL.__version__).base_version) >= version.parse( - "9.1.0"): +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): PIL_INTERPOLATION = { "linear": PIL.Image.Resampling.BILINEAR, "bilinear": PIL.Image.Resampling.BILINEAR, @@ -60,10 +59,7 @@ def numpy_to_pil(images): images = (images * 255).round().astype("uint8") if images.shape[-1] == 1: # special case for grayscale (single channel) images - pil_images = [ - Image.fromarray( - image.squeeze(), mode="L") for image in images - ] + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] else: pil_images = [Image.fromarray(image) for image in images] diff --git a/ppdiffusers/ppdiffusers/utils/testing_utils.py b/ppdiffusers/ppdiffusers/utils/testing_utils.py index 88a8c7e167c47..02e03ca1e944a 100644 --- a/ppdiffusers/ppdiffusers/utils/testing_utils.py +++ b/ppdiffusers/ppdiffusers/utils/testing_utils.py @@ -31,10 +31,16 @@ import PIL.ImageOps import requests -from .import_utils import (BACKENDS_MAPPING, is_compel_available, - is_fastdeploy_available, is_note_seq_available, - is_opencv_available, is_paddle_available, - is_paddle_version, is_torch_available) +from .import_utils import ( + BACKENDS_MAPPING, + is_compel_available, + is_fastdeploy_available, + is_note_seq_available, + is_opencv_available, + is_paddle_available, + is_paddle_version, + is_torch_available, +) from .logging import get_logger global_rng = random.Random() @@ -51,7 +57,8 @@ if paddle_device not in available_backends: raise ValueError( f"unknown paddle backend for ppdiffusers tests: {paddle_device}. Available backends are:" - f" {available_backends}") + f" {available_backends}" + ) logger.info(f"paddle_device overrode to {paddle_device}") else: paddle_device = "gpu" if paddle.is_compiled_with_cuda() else "cpu" @@ -74,25 +81,19 @@ def paddle_all_close(a, b, *args, **kwargs): if not is_paddle_available(): raise ValueError("Paddle needs to be installed to use this function.") if not paddle.allclose(a, b, *args, **kwargs): - assert ( - False - ), f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}." + assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}." 
return True -def print_tensor_test(tensor, - filename="test_corrections.txt", - expected_tensor_name="expected_slice"): +def print_tensor_test(tensor, filename="test_corrections.txt", expected_tensor_name="expected_slice"): test_name = os.environ.get("PYTEST_CURRENT_TEST") if not paddle.is_tensor(tensor): tensor = paddle.to_tensor(tensor) - tensor_str = str(tensor.detach().cpu().flatten().cast("float32")).replace( - "\n", "") + tensor_str = str(tensor.detach().cpu().flatten().cast("float32")).replace("\n", "") # format is usually: # expected_slice = np.array([-0.5713, -0.3018, -0.9814, 0.04663, -0.879, 0.76, -1.734, 0.1044, 1.161]) - output_str = tensor_str.replace("tensor", - f"{expected_tensor_name} = np.array") + output_str = tensor_str.replace("tensor", f"{expected_tensor_name} = np.array") test_file, test_class, test_fn = test_name.split("::") test_fn = test_fn.split()[0] with open(filename, "a") as f: @@ -182,27 +183,27 @@ def require_paddle_2_5(test_case): """ return unittest.skipUnless( is_paddle_available() and is_paddle_version(">=", "2.5.0"), - "test requires Paddle 2.5", )(test_case) + "test requires Paddle 2.5", + )(test_case) def require_paddle(test_case): """ Decorator marking a test that requires Paddle. These tests are skipped when Paddle isn't installed. """ - return unittest.skipUnless(is_paddle_available(), - "test requires Paddle")(test_case) + return unittest.skipUnless(is_paddle_available(), "test requires Paddle")(test_case) def require_torch(test_case): """Decorator marking a test that requires TORCH.""" - return unittest.skipUnless(is_torch_available(), - "test requires TORCH")(test_case) + return unittest.skipUnless(is_torch_available(), "test requires TORCH")(test_case) def require_paddle_gpu(test_case): """Decorator marking a test that requires CUDA and Paddle.""" - return unittest.skipUnless(is_paddle_available() and paddle_device == "gpu", - "test requires Paddle+CUDA")(test_case) + return unittest.skipUnless(is_paddle_available() and paddle_device == "gpu", "test requires Paddle+CUDA")( + test_case + ) def require_compel(test_case): @@ -210,38 +211,32 @@ def require_compel(test_case): Decorator marking a test that requires compel: https://github.com/damian0815/compel. These tests are skipped when the library is not installed. """ - return unittest.skipUnless(is_compel_available(), - "test requires compel")(test_case) + return unittest.skipUnless(is_compel_available(), "test requires compel")(test_case) def require_fastdeploy(test_case): """ Decorator marking a test that requires fastdeploy. These tests are skipped when fastdeploy isn't installed. """ - return unittest.skipUnless(is_fastdeploy_available(), - "test requires fastdeploy")(test_case) + return unittest.skipUnless(is_fastdeploy_available(), "test requires fastdeploy")(test_case) def require_note_seq(test_case): """ Decorator marking a test that requires note_seq. These tests are skipped when note_seq isn't installed. 
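# Illustrative sketch (not part of the patch) of how the skip decorators above are used in tests;
# the test class and body below are hypothetical, not taken from the repo.
import unittest

from ppdiffusers.utils.testing_utils import require_paddle_gpu


class ExampleDeviceTests(unittest.TestCase):
    @require_paddle_gpu  # skipped unless Paddle is installed and a CUDA device is available
    def test_gpu_tensor(self):
        import paddle

        x = paddle.randn([2, 3])
        self.assertEqual(x.shape, [2, 3])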
""" - return unittest.skipUnless(is_note_seq_available(), - "test requires note_seq")(test_case) + return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case) -def load_numpy(arry: Union[str, np.ndarray], - local_path: Optional[str]=None) -> np.ndarray: +def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray: if isinstance(arry, str): # local_path = "/home/patrick_huggingface_co/" if local_path is not None: # local_path can be passed to correct images of tests return os.path.join( local_path, - "/".join([ - arry.split("/")[-5], arry.split("/")[-2], - arry.split("/")[-1] - ]), ) + "/".join([arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]]), + ) elif arry.startswith("http://") or arry.startswith("https://"): response = requests.get(arry) response.raise_for_status() @@ -257,7 +252,8 @@ def load_numpy(arry: Union[str, np.ndarray], else: raise ValueError( "Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a" - " ndarray.") + " ndarray." + ) return arry @@ -320,20 +316,17 @@ def preprocess_image(image: PIL.Image, batch_size: int): return 2.0 * image - 1.0 -def export_to_video(video_frames: List[np.ndarray], - output_video_path: str=None) -> str: +def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str: if is_opencv_available(): import cv2 else: - raise ImportError(BACKENDS_MAPPING["opencv"][1].format( - "export_to_video")) + raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video")) if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name - fourcc = cv2.VideoWriter_fourcc(* "mp4v") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") h, w, c = video_frames[0].shape - video_writer = cv2.VideoWriter( - output_video_path, fourcc, fps=8, frameSize=(w, h)) + video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h)) for i in range(len(video_frames)): img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR) video_writer.write(img) @@ -344,7 +337,8 @@ def load_hf_numpy(path) -> np.ndarray: if not path.startswith("http://") or path.startswith("https://"): path = os.path.join( "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main", - urllib.parse.quote(path), ) + urllib.parse.quote(path), + ) return load_numpy(path) @@ -353,7 +347,8 @@ def load_ppnlp_numpy(path) -> np.ndarray: if not path.startswith("http://") or path.startswith("https://"): path = os.path.join( "https://paddlenlp.bj.bcebos.com/models/community/CompVis/data/diffusers-testing", - urllib.parse.quote(path), ) + urllib.parse.quote(path), + ) return load_numpy(path) @@ -444,9 +439,7 @@ def pytest_terminal_summary_main(tr, id): f.write("slowest durations\n") for i, rep in enumerate(dlist): if rep.duration < durations_min: - f.write( - f"{len(dlist)-i} durations < {durations_min} secs were omitted" - ) + f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted") break f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n") @@ -460,8 +453,7 @@ def summary_failures_short(tr): msg = tr._getfailureheadline(rep) tr.write_sep("_", msg, red=True, bold=True) # chop off the optional leading extra frames, leaving only the last one - longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, - re.M | re.S) + longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S) tr._tw.line(longrepr) # note: not printing out any rep.sections to keep the report short @@ 
-496,9 +488,7 @@ def summary_failures_short(tr): tr.summary_warnings() # normal warnings tr.summary_warnings() # final warnings - tr.reportchars = ( - "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) - ) + tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) with open(report_files["passes"], "w") as f: tr._tw = create_terminal_writer(config, f) tr.summary_passes() diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py b/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py index 0940509378627..ea039412ef292 100644 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py +++ b/ppdiffusers/scripts/cocoeval_keypoints_score/cocoeval_keypoints.py @@ -20,18 +20,8 @@ # This script references https://cocodataset.org/#keypoints-eval. if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "-g", - "--gt", - type=str, - help="Assign the groud true path.", - default=None) - parser.add_argument( - "-d", - "--dt", - type=str, - help="Assign the detection result path.", - default=None) + parser.add_argument("-g", "--gt", type=str, help="Assign the groud true path.", default=None) + parser.add_argument("-d", "--dt", type=str, help="Assign the detection result path.", default=None) args = parser.parse_args() cocoGt = COCO(args.gt) diff --git a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py b/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py index 9e56042786a43..9679d0b744e9d 100644 --- a/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py +++ b/ppdiffusers/scripts/cocoeval_keypoints_score/get_openpose_keypoints_result_coco_format.py @@ -23,6 +23,7 @@ import paddle import paddlehub as hub from annotator.ppdet_hrnet.det_keypoint_unite_infer import PPDetPose + # import PIL from PIL import Image from tqdm import tqdm @@ -46,10 +47,8 @@ def keypoint_to_openpose_kpts(coco_keypoints_list): l_shoulder_keypoint = coco_keypoints_list[l_shoulder_index] r_shoulder_keypoint = coco_keypoints_list[r_shoulder_index] - neck_keypoint_y = int( - (l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) - neck_keypoint_x = int( - (l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) + neck_keypoint_y = int((l_shoulder_keypoint[1] + r_shoulder_keypoint[1]) / 2.0) + neck_keypoint_x = int((l_shoulder_keypoint[0] + r_shoulder_keypoint[0]) / 2.0) neck_keypoint = [ neck_keypoint_x, neck_keypoint_y, @@ -72,19 +71,19 @@ def __call__(self, oriImg, detect_resolution=512, hand=False): img_scalarfactor = detect_resolution / min(oriImg.shape[:2]) result, poseres = self.ppdetpose_pred(oriImg) result["candidate"] = result["candidate"] * img_scalarfactor - oriImg = cv2.resize( - oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) + oriImg = cv2.resize(oriImg, (0, 0), fx=img_scalarfactor, fy=img_scalarfactor) canvas = oriImg.copy() canvas.fill(0) - canvas = self.body_estimation.draw_pose(canvas, result["candidate"], - result["subset"]) + canvas = self.body_estimation.draw_pose(canvas, result["candidate"], result["subset"]) return ( canvas, dict( candidate=result["candidate"].tolist(), - subset=result["subset"].tolist(), ), - poseres, ) + subset=result["subset"].tolist(), + ), + poseres, + ) def ppdetpose_pred(self, image, kpt_threshold=0.3): poseres = self.ppdetpose.ppdet_hrnet_infer(image) @@ -98,7 +97,12 @@ def ppdetpose_pred(self, image, kpt_threshold=0.3): for 
idx, item in enumerate(openpose_kpts): if item[2] > kpt_threshold: subset[kptid][idx] = posnum - kpt = np.array(item + [posnum, ]) + kpt = np.array( + item + + [ + posnum, + ] + ) candidate = np.vstack((candidate, kpt)) posnum += 1 return {"candidate": candidate, "subset": subset}, poseres @@ -138,7 +142,8 @@ def resize_image(input_image, resolution): img = cv2.resize( input_image, (W, H), - interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, ) + interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, + ) return img @@ -151,11 +156,7 @@ def get_keypoints_result_coco_format(paths, detector, do_gt): out_dir_path = pathlib.Path(paths[2]) if not os.path.exists(out_dir_path): os.makedirs(out_dir_path) - files = sorted([ - file - for ext in IMAGE_EXTENSIONS - for file in in_dir_path.glob("*.{}".format(ext)) - ]) + files = sorted([file for ext in IMAGE_EXTENSIONS for file in in_dir_path.glob("*.{}".format(ext))]) output = [] index = -1 for file in tqdm(files): @@ -165,8 +166,7 @@ def get_keypoints_result_coco_format(paths, detector, do_gt): input_image = HWC3(im) canvas, keypoints_result, poseres = detector(input_image) if len(paths) == 3: - Image.fromarray(canvas).save( - os.path.join(out_dir_path, os.path.basename(file))) + Image.fromarray(canvas).save(os.path.join(out_dir_path, os.path.basename(file))) if len(poseres["keypoint"][0]) == 0: sample_dict = { "image_id": index, @@ -209,76 +209,72 @@ def get_keypoints_result_coco_format(paths, detector, do_gt): json.dumps( { "annotations": output, - "images": [{ - "id": item - } for item in list(range(index + 1))], - "categories": [{ - "supercategory": "person", - "id": 1, - "name": "person", - "keypoints": [ - "nose", - "left_eye", - "right_eye", - "left_ear", - "right_ear", - "left_shoulder", - "right_shoulder", - "left_elbow", - "right_elbow", - "left_wrist", - "right_wrist", - "left_hip", - "right_hip", - "left_knee", - "right_knee", - "left_ankle", - "right_ankle", - ], - "skeleton": [ - [16, 14], - [14, 12], - [17, 15], - [15, 13], - [12, 13], - [6, 12], - [7, 13], - [6, 7], - [6, 8], - [7, 9], - [8, 10], - [9, 11], - [2, 3], - [1, 2], - [1, 3], - [2, 4], - [3, 5], - [4, 6], - [5, 7], - ], - }], + "images": [{"id": item} for item in list(range(index + 1))], + "categories": [ + { + "supercategory": "person", + "id": 1, + "name": "person", + "keypoints": [ + "nose", + "left_eye", + "right_eye", + "left_ear", + "right_ear", + "left_shoulder", + "right_shoulder", + "left_elbow", + "right_elbow", + "left_wrist", + "right_wrist", + "left_hip", + "right_hip", + "left_knee", + "right_knee", + "left_ankle", + "right_ankle", + ], + "skeleton": [ + [16, 14], + [14, 12], + [17, 15], + [15, 13], + [12, 13], + [6, 12], + [7, 13], + [6, 7], + [6, 8], + [7, 9], + [8, 10], + [9, 11], + [2, 3], + [1, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + [5, 7], + ], + } + ], }, - indent=4, )) + indent=4, + ) + ) else: json_file.write(json.dumps(output, indent=4)) parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument( - "--do_gt", - action="store_true", - help="whether to predict unseen future data") +parser.add_argument("--do_gt", action="store_true", help="whether to predict unseen future data") parser.add_argument( "path", type=str, nargs=3, - help=( - "Paths to the input images dir, output json file, and output openpose images dir" - ), ) + help=("Paths to the input images dir, output json file, and output openpose images dir"), +) -IMAGE_EXTENSIONS = { - "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", 
"tiff", "webp" -} +IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} if __name__ == "__main__": args = parser.parse_args() diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py index 015f143827f2b..d5fb70e8d90c9 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionImageVariation_to_ppdiffusers.py @@ -16,17 +16,24 @@ import paddle import torch -from diffusers import \ - StableDiffusionImageVariationPipeline as \ - DiffusersStableDiffusionImageVariationPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPVisionConfig, - CLIPVisionModelWithProjection) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) -from ppdiffusers import \ - StableDiffusionImageVariationPipeline as \ - PPDiffusersStableDiffusionImageVariationPipeline +from diffusers import ( + StableDiffusionImageVariationPipeline as DiffusersStableDiffusionImageVariationPipeline, +) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import ( + StableDiffusionImageVariationPipeline as PPDiffusersStableDiffusionImageVariationPipeline, +) from ppdiffusers import UNet2DConditionModel from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker @@ -47,10 +54,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): return new_vae_or_unet -def convert_hf_clip_to_ppnlp_clip(clip, - dtype="float32", - is_text_encoder=True, - need_prefix=False): +def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): new_model_state = {} transformers2ppnlp = { ".encoder.": ".transformer.", @@ -69,9 +73,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -85,7 +87,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and need_prefix: name = "clip." 
+ name @@ -123,8 +125,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -148,17 +149,19 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionImageVariationPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False) + diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False + ) safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False, need_prefix=True) + diffusers_pipe.safety_checker, is_text_encoder=False, need_prefix=True + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -173,18 +176,14 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, # make sure vision_config.update({"projection_dim": pp_unet.config.cross_attention_dim}) - safety_checker_config.update({ - "projection_dim": pp_unet.config.cross_attention_dim - }) + safety_checker_config.update({"projection_dim": pp_unet.config.cross_attention_dim}) # 3. image_encoder - image_encoder = CLIPVisionModelWithProjection( - CLIPVisionConfig.from_dict(vision_config)) + image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config)) image_encoder.set_dict(image_encoder_state_dict) check_keys(image_encoder, image_encoder_state_dict) # 4. safety_checker - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) check_keys(pp_safety_checker, safety_checker_state_dict) # 5. 
scheduler @@ -200,12 +199,10 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -214,12 +211,12 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") # 7. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionImageVariationPipeline( @@ -228,15 +225,15 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, unet=pp_unet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 8. save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -247,7 +244,7 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="sd-image-variations-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py index 756e12bb3c97b..f0a64446ba1d7 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_StableDiffusionUpscalePipeline_to_ppdiffusers.py @@ -17,14 +17,21 @@ import paddle import torch -from diffusers import \ - StableDiffusionUpscalePipeline as DiffusersStableDiffusionUpscalePipeline +from diffusers import ( + StableDiffusionUpscalePipeline as DiffusersStableDiffusionUpscalePipeline, +) from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - LMSDiscreteScheduler, PNDMScheduler) -from ppdiffusers import \ - StableDiffusionUpscalePipeline as PPDiffusersStableDiffusionUpscalePipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import ( + StableDiffusionUpscalePipeline as 
PPDiffusersStableDiffusionUpscalePipeline, +) from ppdiffusers import UNet2DConditionModel paddle.set_device("cpu") @@ -63,9 +70,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -79,7 +84,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." + name @@ -104,23 +109,23 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionUpscalePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) max_noise_level = diffusers_pipe.max_noise_level # 1. vae @@ -134,8 +139,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) # 3. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 4. 
scheduler @@ -150,12 +154,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( beta_schedule=beta_schedule, beta_start=beta_start, num_train_timesteps=num_train_timesteps, - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule=beta_schedule) + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule) elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -164,7 +166,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( clip_sample=False, prediction_type="v_prediction", set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -183,18 +186,19 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "ddpm" in scheduler_type: pp_low_res_scheduler = DDPMScheduler( beta_end=beta_end, beta_schedule=beta_schedule, beta_start=beta_start, - num_train_timesteps=num_train_timesteps, ) + num_train_timesteps=num_train_timesteps, + ) elif "lms" in scheduler_type: pp_low_res_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule=beta_schedule) + beta_start=beta_start, beta_end=beta_end, beta_schedule=beta_schedule + ) elif "ddim" in scheduler_type: pp_low_res_scheduler = DDIMScheduler( beta_start=beta_start, @@ -203,7 +207,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -219,7 +224,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( tokenizer=pp_tokenizer, unet=pp_unet, low_res_scheduler=pp_low_res_scheduler, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 9. 
save_pretrained paddle_pipe.save_pretrained(output_path) @@ -227,8 +233,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -239,7 +244,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="stable-diffusion-x4-upscaler-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py index 4c29f3059b3a1..b3e0ece7e6a03 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_VersatileDiffusion_to_ppdiffusers.py @@ -17,16 +17,26 @@ import paddle import torch -from diffusers import \ - VersatileDiffusionPipeline as DiffusersVersatileDiffusionPipeline +from diffusers import VersatileDiffusionPipeline as DiffusersVersatileDiffusionPipeline from paddlenlp.transformers import ( - CLIPFeatureExtractor, CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection) + CLIPFeatureExtractor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) -from ppdiffusers import \ - VersatileDiffusionPipeline as PPDiffusersVersatileDiffusionPipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers import ( + VersatileDiffusionPipeline as PPDiffusersVersatileDiffusionPipeline, +) from ppdiffusers.pipelines.versatile_diffusion import UNetFlatConditionModel paddle.set_device("cpu") @@ -46,10 +56,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32"): return new_vae_or_unet -def convert_hf_clip_to_ppnlp_clip(clip, - dtype="float32", - is_text_encoder=True, - need_prefix=False): +def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): new_model_state = {} transformers2ppnlp = { ".encoder.": ".transformer.", @@ -68,9 +75,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -84,7 +89,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and need_prefix: name = "clip." 
+ name @@ -122,8 +127,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -147,20 +151,22 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersVersatileDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) image_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.image_unet) text_unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_unet) text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False) + diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False + ) image_encoder_state_dict, vision_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False) + diffusers_pipe.image_encoder, is_text_encoder=False, need_prefix=False + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -179,14 +185,12 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, check_keys(pp_text_unet, text_unet_state_dict) # 4. image_encoder - pp_image_encoder = CLIPVisionModelWithProjection( - CLIPVisionConfig.from_dict(vision_config)) + pp_image_encoder = CLIPVisionModelWithProjection(CLIPVisionConfig.from_dict(vision_config)) pp_image_encoder.set_dict(image_encoder_state_dict) check_keys(pp_image_encoder, image_encoder_state_dict) # 5. 
text_encoder - pp_text_encoder = CLIPTextModelWithProjection( - CLIPTextConfig.from_dict(text_config)) + pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config)) pp_text_encoder.set_dict(text_encoder_state_dict) check_keys(pp_text_encoder, text_encoder_state_dict) @@ -203,12 +207,10 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -217,13 +219,13 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") with tempfile.TemporaryDirectory() as tmpdirname: - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") # 7. tokenizer diffusers_pipe.tokenizer.save_pretrained(tmpdirname) pp_tokenizer = CLIPTokenizer.from_pretrained(tmpdirname) @@ -236,15 +238,15 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, image_unet=pp_image_unet, text_unet=pp_text_unet, vae=pp_vae, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 9. 
save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -255,7 +257,7 @@ def convert_diffusers_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="versatile-diffusion-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + ppdiffusers_pipe = convert_diffusers_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py index 62de6daa072d9..ff8c68985a249 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_alt_diffusion_to_ppdiffusers.py @@ -18,17 +18,28 @@ import paddle import torch from diffusers import AltDiffusionPipeline as DiffusersAltDiffusionPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPVisionConfig, - XLMRobertaTokenizer) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPVisionConfig, + XLMRobertaTokenizer, +) from ppdiffusers import AltDiffusionPipeline as PPDiffusersAltDiffusionPipeline from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, RobertaSeriesModelWithTransformation) + RobertaSeriesConfig, + RobertaSeriesModelWithTransformation, +) from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker paddle.set_device("cpu") @@ -67,9 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -83,7 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." 
+ name @@ -108,8 +117,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -119,10 +127,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): def convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(xlm_roberta, dtype="float32"): new_model_state = {} mappings = [ - [ - "embeddings.word_embeddings.weight", - "embeddings.word_embeddings.weight" - ], + ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"], [ "embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight", @@ -224,21 +229,17 @@ def convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(xlm_roberta, dtype="float32"): hf_name = prefix + hf_name pp_name = prefix + pp_name if need_transpose: - new_model_state[pp_name] = ( - state_dict[hf_name].t().cpu().numpy().astype(dtype)) + new_model_state[pp_name] = state_dict[hf_name].t().cpu().numpy().astype(dtype) else: - new_model_state[pp_name] = state_dict[hf_name].cpu().numpy().astype( - dtype) + new_model_state[pp_name] = state_dict[hf_name].cpu().numpy().astype(dtype) new_config = xlm_roberta.config.to_dict() return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersAltDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + diffusers_pipe = DiffusersAltDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) ( @@ -246,7 +247,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( text_encoder_config, ) = convert_hf_xlm_roberta_to_ppnlp_xlm_roberta(diffusers_pipe.text_encoder) safety_checker_state_dict, safety_checker_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False) + diffusers_pipe.safety_checker, is_text_encoder=False + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -264,8 +266,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( pp_text_encoder.set_dict(text_encoder_state_dict) # 4. safety_checker - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) # 5. 
scheduler @@ -281,7 +282,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -296,8 +298,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( elif scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif scheduler_type == "ddim": @@ -308,8 +309,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( with tempfile.TemporaryDirectory() as tmpdirname: # 6. feature_extractor # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) - pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + pp_feature_extractor = CLIPFeatureExtractor.from_pretrained("CompVis/stable-diffusion-v1-4/feature_extractor") # 7. tokenizer diffusers_pipe.tokenizer.save_pretrained(tmpdirname) @@ -323,15 +323,15 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( unet=pp_unet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=scheduler, ) + scheduler=scheduler, + ) # 9. save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -342,7 +342,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="AltDiffusion-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py index 17aa70d3ef95a..bd8d3e8bbb152 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_controlnet_to_ppdiffusers.py @@ -40,8 +40,7 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -52,11 +51,11 @@ def convert_to_ppdiffusers(controlnet, dtype="float32"): "--output_path", type=str, default="paddle_models/sd-controlnet-canny", - help="The output path.", ) + help="The output path.", + ) args = parser.parse_args() - th_controlnet = DiffusersControlNetModel.from_pretrained( - args.pretrained_model_name_or_path) + th_controlnet = 
DiffusersControlNetModel.from_pretrained(args.pretrained_model_name_or_path) controlnet_state_dict = convert_to_ppdiffusers(th_controlnet) pp_controlnet = PPDiffusersControlNetModel.from_config(th_controlnet.config) pp_controlnet.set_dict(controlnet_state_dict) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py index 021da51309528..7cd30d3c3e077 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_latent_diffusion_model_to_ppdiffusers.py @@ -21,10 +21,8 @@ from paddlenlp.transformers import BertTokenizer from ppdiffusers import AutoencoderKL, DDIMScheduler, LDMBertModel -from ppdiffusers import \ - LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline -from ppdiffusers import (LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) +from ppdiffusers import LDMTextToImagePipeline as PPDiffusersLDMTextToImagePipeline +from ppdiffusers import LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel paddle.set_device("cpu") @@ -87,15 +85,14 @@ def convert_hf_ldmbert_to_ppnlp_ldmbert(ldmbert, dtype="float32"): return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersLDMTextToImagePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert( - diffusers_pipe.bert) + bert_state_dict, bert_config = convert_hf_ldmbert_to_ppnlp_ldmbert(diffusers_pipe.bert) # 1. vqvae pp_vqvae = AutoencoderKL.from_config(diffusers_pipe.vqvae.config) @@ -123,12 +120,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -137,15 +132,15 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") with tempfile.TemporaryDirectory() as tmpdirname: # 5. tokenizer diffusers_pipe.tokenizer.save_pretrained(tmpdirname) - pp_tokenizer = BertTokenizer.from_pretrained( - tmpdirname, model_max_length=77) + pp_tokenizer = BertTokenizer.from_pretrained(tmpdirname, model_max_length=77) # 6. 
create ppdiffusers pipe paddle_pipe = PPDiffusersLDMTextToImagePipeline( @@ -153,7 +148,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( bert=pp_bert, tokenizer=pp_tokenizer, unet=pp_unet, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 7. save_pretrained paddle_pipe.save_pretrained(output_path) @@ -161,8 +157,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -173,7 +168,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="ldm-text2im-large-256-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py index 212808cd405fa..519d032808939 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_paintbyexample_to_ppdiffusers.py @@ -18,15 +18,16 @@ import paddle import torch from diffusers import PaintByExamplePipeline as DiffusersPaintByExamplePipeline + # CLIPImageProcessor need paddlenlp latest from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig from ppdiffusers import AutoencoderKL -from ppdiffusers import \ - PaintByExamplePipeline as PPDiffusersPaintByExamplePipeline +from ppdiffusers import PaintByExamplePipeline as PPDiffusersPaintByExamplePipeline from ppdiffusers import PNDMScheduler, UNet2DConditionModel -from ppdiffusers.pipelines.paint_by_example.image_encoder import \ - PaintByExampleImageEncoder +from ppdiffusers.pipelines.paint_by_example.image_encoder import ( + PaintByExampleImageEncoder, +) paddle.set_device("cpu") @@ -63,9 +64,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): ".post_layernorm.": ".ln_post.", } ignore_value = ["position_ids", "mapper"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids and mapper @@ -79,7 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.cpu().numpy().astype(dtype) @@ -93,8 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -118,15 +116,14 @@ def check_keys(model, state_dict): 
print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_paintbyexample_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_paintbyexample_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersPaintByExamplePipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - image_encoder_state_dict, image_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.image_encoder) + image_encoder_state_dict, image_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.image_encoder) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -138,8 +135,7 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( check_keys(pp_unet, unet_state_dict) # 3. image_encoder - pp_image_encoder = PaintByExampleImageEncoder( - CLIPVisionConfig.from_dict(image_encoder_config)) + pp_image_encoder = PaintByExampleImageEncoder(CLIPVisionConfig.from_dict(image_encoder_config)) pp_image_encoder.set_dict(image_encoder_state_dict) check_keys(pp_image_encoder, image_encoder_state_dict) # 4. scheduler @@ -158,7 +154,8 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( scheduler=pp_scheduler, safety_checker=None, feature_extractor=feature_extractor, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) # 6. save_pretrained paddle_pipe.save_pretrained(output_path) @@ -166,8 +163,7 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -178,7 +174,9 @@ def convert_diffusers_paintbyexample_to_ppdiffusers( "--output_path", type=str, default="./Paint-by-Example", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_paintbyexample_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py index f1d3d6bd2462f..fa189095cbb9d 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion2.0_depth_to_ppdiffusers.py @@ -17,17 +17,22 @@ import paddle import torch -from diffusers import \ - StableDiffusionDepth2ImgPipeline as \ - DiffusersStableDiffusionDepth2ImgPipeline -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - CLIPTokenizer, DPTConfig, - DPTForDepthEstimation, DPTImageProcessor) +from diffusers import ( + StableDiffusionDepth2ImgPipeline as DiffusersStableDiffusionDepth2ImgPipeline, +) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + DPTConfig, + DPTForDepthEstimation, + DPTImageProcessor, +) from ppdiffusers import AutoencoderKL, PNDMScheduler 
-from ppdiffusers import \ - StableDiffusionDepth2ImgPipeline as \ - PPDiffusersStableDiffusionDepth2ImgPipeline +from ppdiffusers import ( + StableDiffusionDepth2ImgPipeline as PPDiffusersStableDiffusionDepth2ImgPipeline, +) from ppdiffusers import UNet2DConditionModel paddle.set_device("cpu") @@ -66,9 +71,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -82,7 +85,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.cpu().numpy().astype(dtype) @@ -117,17 +120,15 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionDepth2ImgPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + pretrained_model_name_or_path, use_auth_token=True + ) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) - depth_estimator_state_dict = convert_to_ppdiffusers( - diffusers_pipe.depth_estimator) - text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder) + depth_estimator_state_dict = convert_to_ppdiffusers(diffusers_pipe.depth_estimator) + text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.text_encoder) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -138,8 +139,7 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) check_keys(pp_unet, unet_state_dict) # 3. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) check_keys(pp_text_encoder, text_encoder_state_dict) # 4. scheduler @@ -168,7 +168,8 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( unet=pp_unet, feature_extractor=pp_feature_extractor, depth_estimator=pp_depth_estimator, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 9. 
save_pretrained paddle_pipe.save_pretrained(output_path) @@ -176,8 +177,7 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -188,7 +188,9 @@ def convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( "--output_path", type=str, default="stable-diffusion-2-depth", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion2_0_depth_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py index 9ec5f95b55248..bd8178c872874 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers.py @@ -17,18 +17,27 @@ import paddle import torch -from diffusers import \ - StableDiffusionControlNetPipeline as \ - DiffusersStableDiffusionControlNetPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextConfig, - CLIPTextModel, CLIPTokenizer, - CLIPVisionConfig) - -from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler, - LMSDiscreteScheduler, PNDMScheduler) -from ppdiffusers import \ - StableDiffusionControlNetPipeline as \ - PPDiffusersStableDiffusionControlNetPipeline +from diffusers import ( + StableDiffusionControlNetPipeline as DiffusersStableDiffusionControlNetPipeline, +) +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionConfig, +) + +from ppdiffusers import ( + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import ( + StableDiffusionControlNetPipeline as PPDiffusersStableDiffusionControlNetPipeline, +) from ppdiffusers import UNet2DConditionModel from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker @@ -69,9 +78,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -85,7 +92,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." 
+ name @@ -110,26 +117,25 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionControlNetPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) - requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", - False) + pretrained_model_name_or_path, use_auth_token=True + ) + requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) controlnet_state_dict = convert_to_ppdiffusers(diffusers_pipe.controlnet) text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -142,14 +148,12 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) # 3. controlnet - pp_controlnet = ControlNetModel.from_config( - diffusers_pipe.controlnet.config) + pp_controlnet = ControlNetModel.from_config(diffusers_pipe.controlnet.config) pp_controlnet.set_dict(controlnet_state_dict) # 4. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 5. scheduler @@ -165,12 +169,10 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -179,7 +181,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -192,14 +195,14 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( # 7. 
feature_extractor # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + "CompVis/stable-diffusion-v1-4/feature_extractor" + ) # 8. safety_checker ( safety_checker_state_dict, - safety_checker_config, ) = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False) - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + safety_checker_config, + ) = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.safety_checker, is_text_encoder=False) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) # 9. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline( @@ -210,7 +213,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( controlnet=pp_controlnet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) else: # 9. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionControlNetPipeline( @@ -222,7 +226,8 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( safety_checker=None, feature_extractor=None, scheduler=pp_scheduler, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path: _internal_dict = dict(paddle_pipe._internal_dict) if _internal_dict["_ppdiffusers_version"] == "0.0.0": @@ -234,8 +239,7 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -246,7 +250,9 @@ def convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( "--output_path", type=str, default="control_sd15_canny-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_controlnet_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py index 6d3811cc0bc82..a3374a432caa4 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_stable_diffusion_to_ppdiffusers.py @@ -17,16 +17,22 @@ import paddle import torch -from diffusers import \ - StableDiffusionPipeline as DiffusersStableDiffusionPipeline -from paddlenlp.transformers import (CLIPFeatureExtractor, CLIPTextConfig, - CLIPTextModel, CLIPTokenizer, - CLIPVisionConfig) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler) -from ppdiffusers import \ - StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline +from diffusers import StableDiffusionPipeline as DiffusersStableDiffusionPipeline +from paddlenlp.transformers import ( + CLIPFeatureExtractor, + CLIPTextConfig, + CLIPTextModel, + 
CLIPTokenizer, + CLIPVisionConfig, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ppdiffusers import StableDiffusionPipeline as PPDiffusersStableDiffusionPipeline from ppdiffusers import UNet2DConditionModel from ppdiffusers.configuration_utils import FrozenDict from ppdiffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker @@ -67,9 +73,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -83,7 +87,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." + name @@ -108,25 +112,24 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_stable_diffusion_to_ppdiffusers( - pretrained_model_name_or_path, output_path=None): +def convert_diffusers_stable_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format diffusers_pipe = DiffusersStableDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) - requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", - False) + pretrained_model_name_or_path, use_auth_token=True + ) + requires_safety_checker = getattr(diffusers_pipe, "requires_safety_checker", False) vae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vae) unet_state_dict = convert_to_ppdiffusers(diffusers_pipe.unet) text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) # 1. vae pp_vae = AutoencoderKL.from_config(diffusers_pipe.vae.config) @@ -139,8 +142,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( pp_unet.set_dict(unet_state_dict) # 3. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 4. 
scheduler @@ -156,12 +158,10 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif "lms" in scheduler_type: - pp_scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + pp_scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif "ddim" in scheduler_type: pp_scheduler = DDIMScheduler( beta_start=beta_start, @@ -170,7 +170,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") @@ -183,14 +184,14 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( # 6. feature_extractor # diffusers_pipe.feature_extractor.save_pretrained(tmpdirname) pp_feature_extractor = CLIPFeatureExtractor.from_pretrained( - "CompVis/stable-diffusion-v1-4/feature_extractor") + "CompVis/stable-diffusion-v1-4/feature_extractor" + ) # 7. safety_checker ( safety_checker_state_dict, - safety_checker_config, ) = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.safety_checker, is_text_encoder=False) - pp_safety_checker = StableDiffusionSafetyChecker( - CLIPVisionConfig.from_dict(safety_checker_config)) + safety_checker_config, + ) = convert_hf_clip_to_ppnlp_clip(diffusers_pipe.safety_checker, is_text_encoder=False) + pp_safety_checker = StableDiffusionSafetyChecker(CLIPVisionConfig.from_dict(safety_checker_config)) pp_safety_checker.set_dict(safety_checker_state_dict) # 8. create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionPipeline( @@ -200,7 +201,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( unet=pp_unet, safety_checker=pp_safety_checker, feature_extractor=pp_feature_extractor, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) else: # 8. 
create ppdiffusers pipe paddle_pipe = PPDiffusersStableDiffusionPipeline( @@ -211,7 +213,8 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( safety_checker=None, feature_extractor=None, scheduler=pp_scheduler, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) if "runwayml/stable-diffusion-inpainting" in pretrained_model_name_or_path: _internal_dict = dict(paddle_pipe._internal_dict) if _internal_dict["_ppdiffusers_version"] == "0.0.0": @@ -223,8 +226,7 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -235,7 +237,9 @@ def convert_diffusers_stable_diffusion_to_ppdiffusers( "--output_path", type=str, default="stable-diffusion-v1-5-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_stable_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py index c5c28bfce9e02..204766187c39c 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_unclip_to_ppdiffusers.py @@ -18,8 +18,11 @@ import paddle import torch from diffusers import UnCLIPPipeline as DiffusersUnCLIPPipeline -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, +) from ppdiffusers import PriorTransformer from ppdiffusers import UnCLIPPipeline as PPDiffusersUnCLIPPipeline @@ -43,10 +46,7 @@ def convert_to_ppdiffusers(vae_or_unet, dtype="float32", prefix=""): return new_vae_or_unet -def convert_hf_clip_to_ppnlp_clip(clip, - dtype="float32", - is_text_encoder=True, - need_prefix=False): +def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True, need_prefix=False): new_model_state = {} transformers2ppnlp = { ".encoder.": ".transformer.", @@ -65,9 +65,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -81,7 +79,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name and need_prefix: name = "clip." 
+ name @@ -119,8 +117,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, "vision_heads": clip.config.num_attention_heads, "vision_embed_dim": clip.config.hidden_size, "vision_patch_size": clip.config.patch_size, - "vision_mlp_ratio": - clip.config.intermediate_size // clip.config.hidden_size, + "vision_mlp_ratio": clip.config.intermediate_size // clip.config.hidden_size, "vision_hidden_act": clip.config.hidden_act, "projection_dim": clip.config.projection_dim, } @@ -144,20 +141,17 @@ def check_keys(model, state_dict): print(f"{cls_name} Found mismatched_keys {mismatched_keys_str}!") -def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersUnCLIPPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + diffusers_pipe = DiffusersUnCLIPPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) prior_state_dict = convert_to_ppdiffusers(diffusers_pipe.prior) decoder_state_dict = convert_to_ppdiffusers(diffusers_pipe.decoder) text_proj_state_dict = convert_to_ppdiffusers(diffusers_pipe.text_proj) - super_res_first_state_dict = convert_to_ppdiffusers( - diffusers_pipe.super_res_first) - super_res_last_state_dict = convert_to_ppdiffusers( - diffusers_pipe.super_res_last) + super_res_first_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_first) + super_res_last_state_dict = convert_to_ppdiffusers(diffusers_pipe.super_res_last) text_encoder_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False) + diffusers_pipe.text_encoder, is_text_encoder=True, need_prefix=False + ) pp_prior = PriorTransformer.from_config(diffusers_pipe.prior.config) pp_prior.set_dict(prior_state_dict) @@ -167,32 +161,25 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, pp_decoder.set_dict(decoder_state_dict) check_keys(pp_decoder, decoder_state_dict) - pp_text_proj = UnCLIPTextProjModel.from_config( - diffusers_pipe.text_proj.config) + pp_text_proj = UnCLIPTextProjModel.from_config(diffusers_pipe.text_proj.config) pp_text_proj.set_dict(text_proj_state_dict) check_keys(pp_text_proj, text_proj_state_dict) - pp_super_res_first = UNet2DModel.from_config( - diffusers_pipe.super_res_first.config) + pp_super_res_first = UNet2DModel.from_config(diffusers_pipe.super_res_first.config) pp_super_res_first.set_dict(super_res_first_state_dict) check_keys(pp_super_res_first, super_res_first_state_dict) - pp_super_res_last = UNet2DModel.from_config( - diffusers_pipe.super_res_last.config) + pp_super_res_last = UNet2DModel.from_config(diffusers_pipe.super_res_last.config) pp_super_res_last.set_dict(super_res_last_state_dict) check_keys(pp_super_res_last, super_res_last_state_dict) - pp_text_encoder = CLIPTextModelWithProjection( - CLIPTextConfig.from_dict(text_config)) + pp_text_encoder = CLIPTextModelWithProjection(CLIPTextConfig.from_dict(text_config)) pp_text_encoder.set_dict(text_encoder_state_dict) check_keys(pp_text_encoder, text_encoder_state_dict) - pp_prior_scheduler = UnCLIPScheduler.from_config( - diffusers_pipe.prior_scheduler.config) - pp_decoder_scheduler = UnCLIPScheduler.from_config( - diffusers_pipe.decoder_scheduler.config) - pp_super_res_scheduler = UnCLIPScheduler.from_config( - diffusers_pipe.super_res_scheduler.config) + pp_prior_scheduler = 
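# check_keys above only compares parameter names: whatever the Paddle module
# expects but the converted dict lacks is "missing", and the reverse is
# "mismatched". The same idea over plain sets (key names made up for illustration):
expected = {"conv_in.weight", "conv_in.bias", "time_embedding.linear_1.weight"}
converted = {"conv_in.weight", "conv_in.bias", "time_embed.0.weight"}
print("missing:", sorted(expected - converted))
print("mismatched:", sorted(converted - expected))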
UnCLIPScheduler.from_config(diffusers_pipe.prior_scheduler.config) + pp_decoder_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.decoder_scheduler.config) + pp_super_res_scheduler = UnCLIPScheduler.from_config(diffusers_pipe.super_res_scheduler.config) with tempfile.TemporaryDirectory() as tmpdirname: # 5. feature_extractor @@ -209,15 +196,15 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, super_res_last=pp_super_res_last, prior_scheduler=pp_prior_scheduler, decoder_scheduler=pp_decoder_scheduler, - super_res_scheduler=pp_super_res_scheduler, ) + super_res_scheduler=pp_super_res_scheduler, + ) # 6. save_pretrained paddle_pipe.save_pretrained(output_path) return paddle_pipe if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -228,7 +215,7 @@ def convert_diffusers_unclip_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="./karlo-v1-alpha", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() - ppdiffusers_pipe = convert_diffusers_unclip_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + ppdiffusers_pipe = convert_diffusers_unclip_to_ppdiffusers(args.pretrained_model_name_or_path, args.output_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py index eb8c950cc052e..d5c0fad1746bf 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_diffusers_vq_diffusion_to_ppdiffusers.py @@ -23,8 +23,7 @@ from ppdiffusers import Transformer2DModel from ppdiffusers import VQDiffusionPipeline as PPDiffusersVQDiffusionPipeline from ppdiffusers import VQDiffusionScheduler, VQModel -from ppdiffusers.pipelines.vq_diffusion import \ - LearnedClassifierFreeSamplingEmbeddings +from ppdiffusers.pipelines.vq_diffusion import LearnedClassifierFreeSamplingEmbeddings paddle.set_device("cpu") @@ -62,9 +61,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): # step1: ignore position_ids @@ -78,7 +75,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." if "vision_model" in name: name = "clip." 
+ name @@ -103,20 +100,17 @@ def convert_hf_clip_to_ppnlp_clip(clip, dtype="float32", is_text_encoder=True): "vision_heads": clip.config.vision_config.num_attention_heads, "vision_embed_dim": clip.config.vision_config.hidden_size, "vision_patch_size": clip.config.vision_config.patch_size, - "vision_mlp_ratio": clip.config.vision_config.intermediate_size // - clip.config.vision_config.hidden_size, + "vision_mlp_ratio": clip.config.vision_config.intermediate_size // clip.config.vision_config.hidden_size, "vision_hidden_act": clip.config.vision_config.hidden_act, "projection_dim": clip.config.projection_dim, } return new_model_state, new_config -def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, - output_path=None): +def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, output_path=None): # 0. load diffusers pipe and convert to ppdiffusers weights format - diffusers_pipe = DiffusersVQDiffusionPipeline.from_pretrained( - pretrained_model_name_or_path, use_auth_token=True) + diffusers_pipe = DiffusersVQDiffusionPipeline.from_pretrained(pretrained_model_name_or_path, use_auth_token=True) # 1. vqvae vqvae_state_dict = convert_to_ppdiffusers(diffusers_pipe.vqvae) @@ -124,35 +118,33 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, transformer_state_dict = convert_to_ppdiffusers(diffusers_pipe.transformer) # 3. learned_classifier_free_sampling_embeddings learned_classifier_free_sampling_embeddings_state_dict = convert_to_ppdiffusers( - diffusers_pipe.learned_classifier_free_sampling_embeddings) + diffusers_pipe.learned_classifier_free_sampling_embeddings + ) # 4.text_encoder text_encoder_state_dict, text_encoder_config = convert_hf_clip_to_ppnlp_clip( - diffusers_pipe.text_encoder, is_text_encoder=True) + diffusers_pipe.text_encoder, is_text_encoder=True + ) # 1. vqvae pp_vqvae = VQModel.from_config(diffusers_pipe.vqvae.config) pp_vqvae.set_dict(vqvae_state_dict) # 2. transformer - pp_transformer = Transformer2DModel.from_config( - diffusers_pipe.transformer.config) + pp_transformer = Transformer2DModel.from_config(diffusers_pipe.transformer.config) pp_transformer.set_dict(transformer_state_dict) # 3. pp_learned_classifier_free_sampling_embeddings - pp_learned_classifier_free_sampling_embeddings = ( - LearnedClassifierFreeSamplingEmbeddings.from_config( - diffusers_pipe.learned_classifier_free_sampling_embeddings.config)) - pp_learned_classifier_free_sampling_embeddings.set_dict( - learned_classifier_free_sampling_embeddings_state_dict) + pp_learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings.from_config( + diffusers_pipe.learned_classifier_free_sampling_embeddings.config + ) + pp_learned_classifier_free_sampling_embeddings.set_dict(learned_classifier_free_sampling_embeddings_state_dict) # 4. text_encoder - pp_text_encoder = CLIPTextModel( - CLIPTextConfig.from_dict(text_encoder_config)) + pp_text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(text_encoder_config)) pp_text_encoder.set_dict(text_encoder_state_dict) # 5. scheduler - pp_scheduler = VQDiffusionScheduler.from_config( - diffusers_pipe.scheduler.config) + pp_scheduler = VQDiffusionScheduler.from_config(diffusers_pipe.scheduler.config) with tempfile.TemporaryDirectory() as tmpdirname: # 6. 
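# The CLIP config assembled above is partly derived rather than copied, e.g.
# vision_mlp_ratio is intermediate_size // hidden_size. With ViT-L/14-style
# values, shown here purely as an example, that ratio comes out to 4:
hidden_size, intermediate_size = 1024, 4096  # illustrative HF config values
print(intermediate_size // hidden_size)      # 4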
tokenizer @@ -166,7 +158,8 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, tokenizer=pp_tokenizer, transformer=pp_transformer, learned_classifier_free_sampling_embeddings=pp_learned_classifier_free_sampling_embeddings, - scheduler=pp_scheduler, ) + scheduler=pp_scheduler, + ) # 8. save_pretrained paddle_pipe.save_pretrained(output_path) @@ -174,8 +167,7 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Pytorch model weights to Paddle model weights.") + parser = argparse.ArgumentParser(description="Pytorch model weights to Paddle model weights.") parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -186,7 +178,9 @@ def convert_diffusers_vq_diffusion_to_ppdiffusers(pretrained_model_name_or_path, "--output_path", type=str, default="microsoft/vq-diffusion-ithq-ppdiffusers", - help="The model output path.", ) + help="The model output path.", + ) args = parser.parse_args() ppdiffusers_pipe = convert_diffusers_vq_diffusion_to_ppdiffusers( - args.pretrained_model_name_or_path, args.output_path) + args.pretrained_model_name_or_path, args.output_path + ) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py index 41b5460d10922..b57a9ef31149d 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_sd_ckpt_to_ppdiffusers.py @@ -30,10 +30,17 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - HeunDiscreteScheduler, LMSDiscreteScheduler, PNDMScheduler, - StableDiffusionPipeline, UNet2DConditionModel) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) paddle.set_device("cpu") MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 @@ -116,8 +123,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): FILENAME = f"archive/{file_name}".encode("latin") padding_size_plus_fbxx = 4 + 14 data_iostream = [] - offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len( - FILENAME) + padding_size_plus_fbxx + offset = MZ_ZIP_LOCAL_DIR_HEADER_SIZE + len(FILENAME) + padding_size_plus_fbxx with open(file, "rb") as r: r.seek(offset) for bytes_data in io.BytesIO(r.read()): @@ -130,8 +136,7 @@ def get_data_iostream(file: str, file_name="data.pkl"): return out, offset + len(out) -def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, - backward_hooks): +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): if isinstance(storage, TensorMeta): storage.size = size return storage @@ -169,8 +174,7 @@ def persistent_load_stage1(saved_id): data_iostream, pre_offset = get_data_iostream(path, file_name="data.pkl") # 1. 
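# convert_orig_sd_ckpt_to_ppdiffusers.py reads the original torch .ckpt without
# importing torch: a checkpoint saved in torch's zip format is an ordinary zip
# archive whose pickle stream sits at "archive/data.pkl", and the script walks to
# it by raw byte offsets (the 30-byte local file header constant plus the entry
# name and padding). A tiny in-memory sketch of just the archive layout, using
# only the standard library -- the payload bytes are a stand-in, not a real
# checkpoint:
import io
import zipfile

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    zf.writestr("archive/data.pkl", b"\x80\x02.")  # placeholder pickle bytes
with zipfile.ZipFile(io.BytesIO(buf.getvalue())) as zf:
    print([n for n in zf.namelist() if n.endswith("data.pkl")])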
read the structure of storage - unpickler_stage1 = UnpicklerWrapperStage( - io.BytesIO(data_iostream), **pickle_load_args) + unpickler_stage1 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) unpickler_stage1.persistent_load = persistent_load_stage1 result_stage1 = unpickler_stage1.load() @@ -202,17 +206,15 @@ def extract_maybe_dict(result): # `MZ_ZIP_LOCAL_DIR_HEADER_SIZE` is from: https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/inline_container.cc#L186 # `16` is the fixed characters size from binary file. # `filename_with_fb` is the length of dynamic data key name - file_handler.seek( - MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1) + file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + 16 + filename_with_fb, 1) - padding_offset = np.frombuffer( - file_handler.read(2)[:1], dtype=np.uint8)[0] + padding_offset = np.frombuffer(file_handler.read(2)[:1], dtype=np.uint8)[0] file_handler.read(padding_offset) # save the tensor info in result to re-use memory stage1_key_to_tensor[key] = np.frombuffer( - file_handler.read(tensor_meta.nbytes), - dtype=tensor_meta.dtype).reshape(tensor_meta.size) + file_handler.read(tensor_meta.nbytes), dtype=tensor_meta.dtype + ).reshape(tensor_meta.size) def persistent_load_stage2(saved_id): assert isinstance(saved_id, tuple) @@ -220,8 +222,7 @@ def persistent_load_stage2(saved_id): return stage1_key_to_tensor[key] # 4. read the structure of storage - unpickler_stage2 = UnpicklerWrapperStage( - io.BytesIO(data_iostream), **pickle_load_args) + unpickler_stage2 = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) unpickler_stage2.persistent_load = persistent_load_stage2 result_stage2 = unpickler_stage2.load() @@ -253,8 +254,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -270,8 +270,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -313,8 +312,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -322,12 +320,13 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. 
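# Stage 2 of the loader above rebuilds each tensor by reading tensor_meta.nbytes
# raw bytes from the archive and viewing them with numpy. The core of that step
# on a toy buffer (dtype and shape stand in for the metadata recovered in stage 1):
import io
import numpy as np

dtype, shape = np.dtype("<f4"), (2, 3)                 # pretend stage-1 metadata
raw = io.BytesIO(np.arange(6, dtype=dtype).tobytes())  # pretend storage bytes
count = int(np.prod(shape))
tensor = np.frombuffer(raw.read(dtype.itemsize * count), dtype=dtype).reshape(shape)
print(tensor.shape, tensor.dtype)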
It splits attention layers, and takes into account additional replacements @@ -335,9 +334,7 @@ def assign_to_checkpoint( Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. if attention_paths_to_split is not None: @@ -345,13 +342,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = np.split(old_tensor, 3, axis=1) @@ -363,8 +358,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -374,8 +368,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -403,34 +396,28 @@ def create_unet_diffusers_config(original_config, image_size: int): unet_params = original_config.model.params.unet_config.params vae_params = original_config.model.params.first_stage_config.params.ddconfig - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2**(len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) head_dim = unet_params.num_heads if "num_heads" in unet_params else None - use_linear_projection = (unet_params.use_linear_in_transformer - if "use_linear_in_transformer" in unet_params else - False) + use_linear_projection = ( + unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: @@ -446,7 +433,8 @@ def create_unet_diffusers_config(original_config, image_size: int): layers_per_block=unet_params.num_res_blocks, 
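# assign_to_checkpoint splits a fused qkv attention weight into separate query /
# key / value tensors by regrouping it per head and splitting along the fused
# axis, as in the snippet above. A numeric sketch of that reshape/split
# (num_head_channels and the random tensor are illustrative):
import numpy as np

num_head_channels = 8
old_tensor = np.random.randn(96, 32, 1).astype("float32")   # fused qkv weight, (3*C, C, 1)
channels = old_tensor.shape[0] // 3                          # 32
target_shape = (-1, channels) if old_tensor.ndim == 3 else (-1,)
num_heads = old_tensor.shape[0] // num_head_channels // 3    # 4
regrouped = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
query, key, value = np.split(regrouped, 3, axis=1)
print(query.reshape(target_shape).shape)                     # (32, 32)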
cross_attention_dim=unet_params.context_dim, attention_head_dim=head_dim, - use_linear_projection=use_linear_projection, ) + use_linear_projection=use_linear_projection, + ) return config @@ -470,7 +458,8 @@ def create_vae_diffusers_config(original_config, image_size: int): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config @@ -479,14 +468,12 @@ def create_diffusers_schedular(original_config): num_train_timesteps=original_config.model.params.timesteps, beta_start=original_config.model.params.linear_start, beta_end=original_config.model.params.linear_end, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) return schedular -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -507,8 +494,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -521,17 +507,12 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -540,35 +521,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in 
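# convert_ldm_unet_checkpoint first buckets the flat LDM keys by block index
# (e.g. "input_blocks.3.0.in_layers.0.weight" belongs to input block 3) before
# renaming them, using the set/dict comprehensions shown here. The same trick on
# a couple of made-up keys:
unet_keys = [
    "input_blocks.0.0.weight",
    "input_blocks.1.0.in_layers.0.weight",
    "input_blocks.1.1.norm.weight",
    "middle_block.0.in_layers.0.weight",
]
num_input_blocks = len({".".join(k.split(".")[:2]) for k in unet_keys if "input_blocks" in k})
input_blocks = {
    layer_id: [k for k in unet_keys if f"input_blocks.{layer_id}" in k]
    for layer_id in range(num_input_blocks)
}
print(num_input_blocks, input_blocks[1])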
unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -577,21 +546,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -603,7 +568,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -616,19 +582,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -637,14 +602,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ 
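# In the loops above, the flat input_blocks / output_blocks index i is mapped to
# a (block, layer-within-block) pair with layers_per_block + 1 slots per block,
# the extra slot accounting for the down/upsampler entry. For example, with the
# usual layers_per_block = 2:
layers_per_block = 2
for i in range(1, 7):
    block_id = (i - 1) // (layers_per_block + 1)
    layer_in_block_id = (i - 1) % (layers_per_block + 1)
    print(i, "->", (block_id, layer_in_block_id))
# i = 1..3 land in down block 0, i = 4..6 in down block 1, and so on.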
-655,12 +619,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -674,31 +634,30 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.weight", "conv.bias"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: attentions = [] if ["conv.bias", "conv.weight"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.bias", "conv.weight"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. 
if len(attentions) == 2: @@ -708,27 +667,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -746,107 +706,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + 
new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -854,58 +781,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) 
conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -913,14 +832,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -955,7 +873,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): clip = {} for key in checkpoint.keys(): if key.startswith("cond_stage_model.transformer"): - clip[key[len("cond_stage_model.transformer."):]] = checkpoint[key] + clip[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] new_model_state = {} transformers2ppnlp = { @@ -975,9 +893,7 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.items(): # step1: ignore position_ids if any(i in 
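# conv_attn_to_linear (applied right above) and the "proj_attn.weight" branch of
# assign_to_checkpoint exist because the LDM VAE stores its attention projections
# as 1x1 convolutions, whereas the target AutoencoderKL expects Linear-style
# weights. The conversion amounts to slicing away the trailing kernel dimensions;
# a toy example of that idea (the shape is illustrative, and the exact slice the
# scripts use varies per key):
import numpy as np

conv_proj = np.random.randn(512, 512, 1, 1).astype("float32")  # 1x1 conv kernel
linear_proj = conv_proj[:, :, 0, 0]                            # same values, (out, in) layout
print(linear_proj.shape)  # (512, 512)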
name for i in ignore_value): @@ -990,17 +906,14 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) new_model_state[name] = value.astype(dtype) new_config = { - "max_text_length": - new_model_state["text_model.positional_embedding.weight"].shape[0], - "vocab_size": - new_model_state["text_model.token_embedding.weight"].shape[0], - "text_embed_dim": - new_model_state["text_model.token_embedding.weight"].shape[1], + "max_text_length": new_model_state["text_model.positional_embedding.weight"].shape[0], + "vocab_size": new_model_state["text_model.token_embedding.weight"].shape[0], + "text_embed_dim": new_model_state["text_model.token_embedding.weight"].shape[1], "text_heads": 12, "text_layers": 12, "text_hidden_act": "quick_gelu", @@ -1019,7 +932,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default=None, @@ -1045,13 +959,15 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() image_size = 512 @@ -1061,14 +977,14 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): if args.original_config_file is None: get_path_from_url( "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/v1-inference.yaml", - root_dir="./", ) + root_dir="./", + ) args.original_config_file = "./v1-inference.yaml" original_config = OmegaConf.load(args.original_config_file) if args.num_in_channels is not None: - original_config["model"]["params"]["unet_config"]["params"][ - "in_channels"] = args.num_in_channels + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = args.num_in_channels num_train_timesteps = original_config.model.params.timesteps beta_start = original_config.model.params.linear_start @@ -1081,7 +997,8 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): num_train_timesteps=num_train_timesteps, steps_offset=1, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -1096,44 +1013,37 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): elif args.scheduler_type == "euler": scheduler = EulerDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "euler-ancestral": - scheduler = EulerAncestralDiscreteScheduler.from_config( - scheduler.config) + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) elif args.scheduler_type == "dpm": scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) elif args.scheduler_type == "ddim": scheduler = scheduler else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type 
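# The CLIP text config in this script is reconstructed from the converted weights
# themselves: the positional-embedding row count becomes max_text_length and the
# token-embedding shape gives vocab_size / text_embed_dim. With toy arrays
# standing in for the real weights (the sizes are just the familiar SD v1 values):
import numpy as np

state = {
    "text_model.positional_embedding.weight": np.zeros((77, 768), dtype="float32"),
    "text_model.token_embedding.weight": np.zeros((49408, 768), dtype="float32"),
}
print({
    "max_text_length": state["text_model.positional_embedding.weight"].shape[0],
    "vocab_size": state["text_model.token_embedding.weight"].shape[0],
    "text_embed_dim": state["text_model.token_embedding.weight"].shape[1],
})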
{args.scheduler_type} doesn't exist!") # 1. Convert the UNet2DConditionModel model. - diffusers_unet_config = create_unet_diffusers_config( - original_config, image_size=image_size) + diffusers_unet_config = create_unet_diffusers_config(original_config, image_size=image_size) diffusers_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = UNet2DConditionModel.from_config(diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the VAE model. - vae_config = create_vae_diffusers_config( - original_config, image_size=image_size) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL.from_config(vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text_encoder model. - text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip( - checkpoint, dtype="float32") + text_model_state_dict, text_config = convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32") text_model = CLIPTextModel(CLIPTextConfig.from_dict(text_config)) text_model.eval() check_keys(text_model, text_model_state_dict) @@ -1150,5 +1060,6 @@ def convert_hf_clip_to_ppnlp_clip(checkpoint, dtype="float32"): scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py index 96786f7bd3255..55fd755445702 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_orig_stablediffusion2.0_ckpt_to_ppdiffusers.py @@ -27,10 +27,15 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from transformers import CLIPTextModel as HFCLIPTextModel -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) paddle.set_device("cpu") @@ -60,8 +65,7 @@ def renew_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("emb_layers.1", "time_emb_proj") new_item = new_item.replace("skip_connection", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, 
n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -77,8 +81,7 @@ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): new_item = old_item new_item = new_item.replace("nin_shortcut", "conv_shortcut") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -120,8 +123,7 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): new_item = new_item.replace("proj_out.weight", "proj_attn.weight") new_item = new_item.replace("proj_out.bias", "proj_attn.bias") - new_item = shave_segments( - new_item, n_shave_prefix_segments=n_shave_prefix_segments) + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) mapping.append({"old": old_item, "new": new_item}) @@ -129,21 +131,20 @@ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): def assign_to_checkpoint( - paths, - checkpoint, - old_checkpoint, - attention_paths_to_split=None, - additional_replacements=None, - config=None, ): + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + config=None, +): """ This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new checkpoint. """ - assert isinstance( - paths, - list), "Paths should be a list of dicts containing 'old' and 'new' keys." + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." # Splits the attention layers into three variables. 
if attention_paths_to_split is not None: @@ -151,13 +152,11 @@ def assign_to_checkpoint( old_tensor = old_checkpoint[path] channels = old_tensor.shape[0] // 3 - target_shape = (-1, channels) if len(old_tensor.shape) == 3 else ( - -1) + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 - old_tensor = old_tensor.reshape((num_heads, 3 * channels // - num_heads) + old_tensor.shape[1:]) + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) query, key, value = old_tensor.split(channels // num_heads, dim=1) checkpoint[path_map["query"]] = query.reshape(target_shape) @@ -168,8 +167,7 @@ def assign_to_checkpoint( new_path = path["new"] # These have already been assigned - if (attention_paths_to_split is not None and - new_path in attention_paths_to_split): + if attention_paths_to_split is not None and new_path in attention_paths_to_split: continue # Global renaming happens here @@ -179,8 +177,7 @@ def assign_to_checkpoint( if additional_replacements is not None: for replacement in additional_replacements: - new_path = new_path.replace(replacement["old"], - replacement["new"]) + new_path = new_path.replace(replacement["old"], replacement["new"]) # proj_attn.weight has to be converted from conv 1D to linear if "proj_attn.weight" in new_path: @@ -207,25 +204,19 @@ def create_unet_diffusers_config(original_config): """ unet_params = original_config.model.params.unet_config.params - block_out_channels = [ - unet_params.model_channels * mult for mult in unet_params.channel_mult - ] + block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = ("CrossAttnDownBlock2D" - if resolution in unet_params.attention_resolutions else - "DownBlock2D") + block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = ("CrossAttnUpBlock2D" - if resolution in unet_params.attention_resolutions else - "UpBlock2D") + block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 @@ -242,7 +233,8 @@ def create_unet_diffusers_config(original_config): block_out_channels=tuple(block_out_channels), layers_per_block=unet_params.num_res_blocks, cross_attention_dim=unet_params.context_dim, - attention_head_dim=attention_head_dim, ) + attention_head_dim=attention_head_dim, + ) return config @@ -266,14 +258,12 @@ def create_vae_diffusers_config(original_config): up_block_types=tuple(up_block_types), block_out_channels=tuple(block_out_channels), latent_channels=vae_params.z_channels, - layers_per_block=vae_params.num_res_blocks, ) + layers_per_block=vae_params.num_res_blocks, + ) return config -def convert_ldm_unet_checkpoint(checkpoint, - config, - path=None, - extract_ema=False): +def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False): """ Takes a state dict and a config, and returns a converted checkpoint. """ @@ -294,8 +284,7 @@ def convert_ldm_unet_checkpoint(checkpoint, for key in keys: if key.startswith("model.diffusion_model"): flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop( - flat_ema_key) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) else: print( "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" @@ -308,17 +297,12 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint = {} - new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict[ - "time_embed.0.weight"] - new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict[ - "time_embed.0.bias"] - new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict[ - "time_embed.2.weight"] - new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict[ - "time_embed.2.bias"] - - new_checkpoint["conv_in.weight"] = unet_state_dict[ - "input_blocks.0.0.weight"] + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] @@ -327,35 +311,23 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] # Retrieves the keys for the input blocks only - num_input_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "input_blocks" in layer - }) + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) input_blocks = { - layer_id: - [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] for layer_id in range(num_input_blocks) } # Retrieves the keys for the middle blocks only - num_middle_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "middle_block" in layer - }) + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) middle_blocks = { - layer_id: - [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] for layer_id in range(num_middle_blocks) } # Retrieves the keys for the output blocks only - num_output_blocks = len({ - ".".join(layer.split(".")[:2]) - for layer in unet_state_dict if "output_blocks" in layer - }) + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) output_blocks = { - layer_id: - [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] for layer_id in range(num_output_blocks) } @@ -364,21 +336,17 @@ def convert_ldm_unet_checkpoint(checkpoint, layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) resnets = [ - key for key in input_blocks[i] - if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in - key - ] - attentions = [ - key for key in input_blocks[i] if f"input_blocks.{i}.1" in key + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key ] + 
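# The --extract_ema branch above depends on how the original checkpoints name
# their EMA copies: "model." becomes "model_ema." and the remaining dots are
# stripped. Running the script's own expression on one example key shows the
# flattened form it looks up:
key = "model.diffusion_model.input_blocks.0.0.weight"
flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
print(flat_ema_key)  # model_ema.diffusion_modelinput_blocks00weight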
attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] if f"input_blocks.{i}.0.op.weight" in unet_state_dict: - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.weight") - new_checkpoint[ - f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( - f"input_blocks.{i}.0.op.bias") + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) paths = renew_resnet_paths(resnets) meta_path = { @@ -390,7 +358,8 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if len(attentions): paths = renew_attention_paths(attentions) @@ -403,19 +372,18 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) resnet_0 = middle_blocks[0] attentions = middle_blocks[1] resnet_1 = middle_blocks[2] resnet_0_paths = renew_resnet_paths(resnet_0) - assign_to_checkpoint( - resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) resnet_1_paths = renew_resnet_paths(resnet_1) - assign_to_checkpoint( - resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) attentions_paths = renew_attention_paths(attentions) meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} @@ -424,14 +392,13 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) for i in range(num_output_blocks): block_id = i // (config["layers_per_block"] + 1) layer_in_block_id = i % (config["layers_per_block"] + 1) - output_block_layers = [ - shave_segments(name, 2) for name in output_blocks[i] - ] + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] output_block_list = {} for layer in output_block_layers: @@ -442,12 +409,8 @@ def convert_ldm_unet_checkpoint(checkpoint, output_block_list[layer_id] = [layer_name] if len(output_block_list) > 1: - resnets = [ - key for key in output_blocks[i] if f"output_blocks.{i}.0" in key - ] - attentions = [ - key for key in output_blocks[i] if f"output_blocks.{i}.1" in key - ] + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) @@ -461,17 +424,17 @@ def convert_ldm_unet_checkpoint(checkpoint, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) if ["conv.weight", "conv.bias"] in output_block_list.values(): - index = list(output_block_list.values()).index( - ["conv.weight", "conv.bias"]) - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.weight"] - new_checkpoint[ - f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ - f"output_blocks.{i}.{index}.conv.bias"] + index = list(output_block_list.values()).index(["conv.weight", "conv.bias"]) + 
new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] # Clear attentions as they have been attributed above. if len(attentions) == 2: @@ -481,27 +444,28 @@ def convert_ldm_unet_checkpoint(checkpoint, paths = renew_attention_paths(attentions) meta_path = { "old": f"output_blocks.{i}.1", - "new": - f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", } assign_to_checkpoint( paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) else: - resnet_0_paths = renew_resnet_paths( - output_block_layers, n_shave_prefix_segments=1) + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = ".".join(["output_blocks", str(i), path["old"]]) - new_path = ".".join([ - "up_blocks", - str(block_id), - "resnets", - str(layer_in_block_id), - path["new"], - ]) + new_path = ".".join( + [ + "up_blocks", + str(block_id), + "resnets", + str(layer_in_block_id), + path["new"], + ] + ) new_checkpoint[new_path] = unet_state_dict[old_path] @@ -519,107 +483,74 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint = {} - new_checkpoint["encoder.conv_in.weight"] = vae_state_dict[ - "encoder.conv_in.weight"] - new_checkpoint["encoder.conv_in.bias"] = vae_state_dict[ - "encoder.conv_in.bias"] - new_checkpoint["encoder.conv_out.weight"] = vae_state_dict[ - "encoder.conv_out.weight"] - new_checkpoint["encoder.conv_out.bias"] = vae_state_dict[ - "encoder.conv_out.bias"] - new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict[ - "encoder.norm_out.weight"] - new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict[ - "encoder.norm_out.bias"] - - new_checkpoint["decoder.conv_in.weight"] = vae_state_dict[ - "decoder.conv_in.weight"] - new_checkpoint["decoder.conv_in.bias"] = vae_state_dict[ - "decoder.conv_in.bias"] - new_checkpoint["decoder.conv_out.weight"] = vae_state_dict[ - "decoder.conv_out.weight"] - new_checkpoint["decoder.conv_out.bias"] = vae_state_dict[ - "decoder.conv_out.bias"] - new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict[ - "decoder.norm_out.weight"] - new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict[ - "decoder.norm_out.bias"] + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] 
new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] - new_checkpoint["post_quant_conv.weight"] = vae_state_dict[ - "post_quant_conv.weight"] - new_checkpoint["post_quant_conv.bias"] = vae_state_dict[ - "post_quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] # Retrieves the keys for the encoder down blocks only - num_down_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "encoder.down" in layer - }) + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) down_blocks = { - layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] - for layer_id in range(num_down_blocks) + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) } # Retrieves the keys for the decoder up blocks only - num_up_blocks = len({ - ".".join(layer.split(".")[:3]) - for layer in vae_state_dict if "decoder.up" in layer - }) + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) up_blocks = { - layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] - for layer_id in range(num_up_blocks) + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) } for i in range(num_down_blocks): - resnets = [ - key for key in down_blocks[i] - if f"down.{i}" in key and f"down.{i}.downsample" not in key - ] + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.weight") - new_checkpoint[ - f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( - f"encoder.down.{i}.downsample.conv.bias") + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"down.{i}.block", - "new": f"down_blocks.{i}.resnets" - } + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"encoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "encoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] 
paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -627,58 +558,50 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) for i in range(num_up_blocks): block_id = num_up_blocks - 1 - i resnets = [ - key for key in up_blocks[block_id] - if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key ] if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.weight"] - new_checkpoint[ - f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ - f"decoder.up.{block_id}.upsample.conv.bias"] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"up.{block_id}.block", - "new": f"up_blocks.{i}.resnets" - } + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] num_mid_res_blocks = 2 for i in range(1, num_mid_res_blocks + 1): - resnets = [ - key for key in mid_resnets if f"decoder.mid.block_{i}" in key - ] + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] paths = renew_vae_resnet_paths(resnets) - meta_path = { - "old": f"mid.block_{i}", - "new": f"mid_block.resnets.{i - 1}" - } + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} assign_to_checkpoint( paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) - mid_attentions = [ - key for key in vae_state_dict if "decoder.mid.attn" in key - ] + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] paths = renew_vae_attention_paths(mid_attentions) meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} assign_to_checkpoint( @@ -686,14 +609,13 @@ def convert_ldm_vae_checkpoint(checkpoint, config): new_checkpoint, vae_state_dict, additional_replacements=[meta_path], - config=config, ) + config=config, + ) conv_attn_to_linear(new_checkpoint) return new_checkpoint -def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, - diffusers_vae_unet_checkpoint, - dtype="float32"): +def convert_diffusers_vae_unet_to_ppdiffusers(vae_or_unet, diffusers_vae_unet_checkpoint, dtype="float32"): need_transpose = [] for k, v in vae_or_unet.named_sublayers(include_self=True): if isinstance(v, paddle.nn.Linear): @@ -745,9 +667,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): ".vision_model.": ".", } ignore_value = ["position_ids"] - donot_transpose = [ - "embeddings", "norm", "concept_embeds", "special_care_embeds" - ] + donot_transpose = ["embeddings", "norm", "concept_embeds", "special_care_embeds"] for name, value in clip.state_dict().items(): if f".{layer_need_to_ignore}." 
in name: continue @@ -762,7 +682,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): name = name.replace(hf_name, ppnlp_name) # step4: 0d tensor -> 1d tensor if name == "logit_scale": - value = value.reshape((1, )) + value = value.reshape((1,)) # step5: safety_checker need prefix "clip." new_model_state[name] = value.cpu().numpy().astype(dtype) @@ -788,7 +708,8 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): default=None, type=str, required=True, - help="Path to the checkpoint to convert.", ) + help="Path to the checkpoint to convert.", + ) parser.add_argument( "--original_config_file", default="v2-inference.yaml", @@ -809,13 +730,15 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights" " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield" " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning." - ), ) + ), + ) parser.add_argument( "--dump_path", default=None, type=str, required=True, - help="Path to the output model.", ) + help="Path to the output model.", + ) args = parser.parse_args() @@ -836,26 +759,23 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): checkpoint, diffusers_unet_config, path=args.checkpoint_path, - extract_ema=args.extract_ema, ) + extract_ema=args.extract_ema, + ) unet = UNet2DConditionModel(**diffusers_unet_config) - ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - unet, diffusers_unet_checkpoint) + ppdiffusers_unet_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(unet, diffusers_unet_checkpoint) check_keys(unet, ppdiffusers_unet_checkpoint) unet.load_dict(ppdiffusers_unet_checkpoint) # 2. Convert the VAE model. vae_config = create_vae_diffusers_config(original_config) - diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, - vae_config) + diffusers_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) - ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers( - vae, diffusers_vae_checkpoint) + ppdiffusers_vae_checkpoint = convert_diffusers_vae_unet_to_ppdiffusers(vae, diffusers_vae_checkpoint) check_keys(vae, ppdiffusers_vae_checkpoint) vae.load_dict(ppdiffusers_vae_checkpoint) # 3. Convert the text model. - text_model_type = original_config.model.params.cond_stage_config.target.split( - ".")[-1] + text_model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] layer = original_config.model.params.cond_stage_config.params.layer if layer == "last": layer_idx = 0 @@ -867,19 +787,16 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): if text_model_type != "FrozenOpenCLIPEmbedder": print("We only support FrozenOpenCLIPEmbedder as text_encoder!") - clip = HFCLIPTextModel.from_pretrained( - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K") - ppdiffusers_clip_checkpoint, clip_config = convert_hf_clip_to_ppnlp_clip( - clip, layer_idx) + clip = HFCLIPTextModel.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K") + ppdiffusers_clip_checkpoint, clip_config = convert_hf_clip_to_ppnlp_clip(clip, layer_idx) text_encoder = CLIPTextModel(CLIPTextConfig.from_dict(clip_config)) text_encoder.load_dict(ppdiffusers_clip_checkpoint) # 5. load tokenizer. 
pp_tokenizer = CLIPTokenizer.from_pretrained( - "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", - pad_token="!", - model_max_length=77) + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", pad_token="!", model_max_length=77 + ) # 6. Convert scheduler. num_train_timesteps = original_config.model.params.timesteps @@ -894,17 +811,14 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): set_alpha_to_one=False, steps_offset=1, # Make sure the scheduler compatible with PNDM - skip_prk_steps=True, ) + skip_prk_steps=True, + ) elif args.scheduler_type == "lms": - scheduler = LMSDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + scheduler = LMSDiscreteScheduler(beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear") elif args.scheduler_type == "euler-ancestral": scheduler = EulerAncestralDiscreteScheduler( - beta_start=beta_start, - beta_end=beta_end, - beta_schedule="scaled_linear") + beta_start=beta_start, beta_end=beta_end, beta_schedule="scaled_linear" + ) elif args.scheduler_type == "ddim": scheduler = DDIMScheduler( beta_start=beta_start, @@ -913,10 +827,10 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): # Make sure the scheduler compatible with DDIM clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) else: - raise ValueError( - f"Scheduler of type {args.scheduler_type} doesn't exist!") + raise ValueError(f"Scheduler of type {args.scheduler_type} doesn't exist!") pipe = StableDiffusionPipeline( vae=vae, @@ -926,6 +840,7 @@ def convert_hf_clip_to_ppnlp_clip(clip, layer_idx, dtype="float32"): scheduler=scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) pipe.save_pretrained(args.dump_path) diff --git a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py b/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py index 7caddb24c95d2..b7bed2a4b3b35 100644 --- a/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py +++ b/ppdiffusers/scripts/convert_diffusers_model/convert_ppdiffusers_stable_diffusion_to_fastdeploy.py @@ -19,22 +19,20 @@ import paddle -from ppdiffusers import (FastDeployStableDiffusionInpaintPipeline, - FastDeployStableDiffusionMegaPipeline, - StableDiffusionPipeline) +from ppdiffusers import ( + FastDeployStableDiffusionInpaintPipeline, + FastDeployStableDiffusionMegaPipeline, + StableDiffusionPipeline, +) from ppdiffusers.fastdeploy_utils import FastDeployRuntimeModel -def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, - output_path: str, - mode: bool=False): - pipeline = StableDiffusionPipeline.from_pretrained( - model_path, safety_checker=None, feature_extractor=None) +def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, output_path: str, mode: bool = False): + pipeline = StableDiffusionPipeline.from_pretrained(model_path, safety_checker=None, feature_extractor=None) output_path = Path(output_path) # get arguments - cross_attention_dim = ( - pipeline.unet.config.cross_attention_dim) # 768 or 1024 or 1280 + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 unet_channels = pipeline.unet.config.in_channels # 4 or 9 vae_in_channels = pipeline.vae.config.in_channels # 3 vae_latent_channels = pipeline.vae.config.latent_channels # 4 @@ -42,14 +40,12 @@ def 
convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, f"cross_attention_dim: {cross_attention_dim}\n", f"unet_in_channels: {unet_channels}\n", f"vae_encoder_in_channels: {vae_in_channels}\n", - f"vae_decoder_latent_channels: {vae_latent_channels}", ) + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) # 1. Convert text_encoder text_encoder = paddle.jit.to_static( pipeline.text_encoder, - input_spec=[ - paddle.static.InputSpec( - shape=[None, None], dtype="int64", name="input_ids") - ], # input_ids + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids ) save_path = os.path.join(args.output_path, "text_encoder", "inference") paddle.jit.save(text_encoder, save_path) @@ -60,17 +56,15 @@ def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(model_path: str, unet = paddle.jit.to_static( pipeline.unet, input_spec=[ - paddle.static.InputSpec( - shape=[None, unet_channels, None, None], - dtype="float32", - name="sample"), # sample - paddle.static.InputSpec( - shape=[1], dtype="int64", name="timestep"), # timestep + paddle.static.InputSpec(shape=[None, unet_channels, None, None], dtype="float32", name="sample"), # sample + paddle.static.InputSpec(shape=[1], dtype="int64", name="timestep"), # timestep paddle.static.InputSpec( shape=[None, None, cross_attention_dim], dtype="float32", - name="encoder_hidden_states", ), # encoder_hidden_states - ], ) + name="encoder_hidden_states", + ), # encoder_hidden_states + ], + ) save_path = os.path.join(args.output_path, "unet", "inference") paddle.jit.save(unet, save_path) print(f"Save unet model in {save_path} successfully.") @@ -87,8 +81,7 @@ def forward_vae_encoder_sample(self, z): if mode: vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) else: - vae_encoder.forward = MethodType(forward_vae_encoder_sample, - vae_encoder) + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) vae_encoder = paddle.jit.to_static( vae_encoder, @@ -98,7 +91,8 @@ def forward_vae_encoder_sample(self, z): dtype="float32", name="sample", # N, C, H, W ), # latent - ], ) + ], + ) # Save vae_encoder in static graph model. save_path = os.path.join(args.output_path, "vae_encoder", "inference") paddle.jit.save(vae_encoder, save_path) @@ -117,8 +111,10 @@ def forward_vae_decoder(self, z): paddle.static.InputSpec( shape=[None, vae_latent_channels, None, None], dtype="float32", - name="latent_sample", ), # latent_sample - ], ) + name="latent_sample", + ), # latent_sample + ], + ) # Save vae_decoder in static graph model. 
save_path = os.path.join(args.output_path, "vae_decoder", "inference") paddle.jit.save(vae_decoder, save_path) @@ -131,18 +127,16 @@ def forward_vae_decoder(self, z): fd_pipe_cls = FastDeployStableDiffusionMegaPipeline fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / - "vae_decoder"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / - "text_encoder"), + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), tokenizer=pipeline.tokenizer, scheduler=pipeline.scheduler, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) fastdeploy_pipeline.save_pretrained(output_path) print("FastDeploy pipeline saved to", output_path) @@ -174,17 +168,13 @@ def forward_vae_decoder(self, z): required=True, help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", ) - parser.add_argument( - "--output_path", - type=str, - required=True, - help="Path to the output model.") + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") parser.add_argument( "--mode", action="store_true", default=False, - help="Export the vae encoder in mode or sample", ) + help="Export the vae encoder in mode or sample", + ) args = parser.parse_args() - convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( - args.model_path, args.output_path, args.mode) + convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(args.model_path, args.output_path, args.mode) diff --git a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py b/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py index e8def2f35e60a..6a27ffff944e8 100644 --- a/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py +++ b/ppdiffusers/scripts/fid_clip_score/compute_fid_clip_score.py @@ -53,9 +53,9 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): all_text_embeds = [] all_image_embeds = [] for text, image_path in tqdm( - zip( - batchify(texts, batch_size), batchify(images_path, batch_size)), - total=math.ceil(len(texts) / batch_size), ): + zip(batchify(texts, batch_size), batchify(images_path, batch_size)), + total=math.ceil(len(texts) / batch_size), + ): assert len(text) == len(image_path) batch_inputs = processor( text=text, @@ -63,56 +63,52 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): return_tensors="pd", max_length=processor.tokenizer.model_max_length, padding="max_length", - truncation=True, ) - text_embeds = model.get_text_features( - input_ids=batch_inputs["input_ids"]) - image_embeds = model.get_image_features( - pixel_values=batch_inputs["pixel_values"]) + truncation=True, + ) + text_embeds = model.get_text_features(input_ids=batch_inputs["input_ids"]) + image_embeds = model.get_image_features(pixel_values=batch_inputs["pixel_values"]) all_text_embeds.append(text_embeds) all_image_embeds.append(image_embeds) all_text_embeds = paddle.concat(all_text_embeds) all_image_embeds = paddle.concat(all_image_embeds) - all_text_embeds = all_text_embeds / all_text_embeds.norm( - axis=-1, keepdim=True) - all_image_embeds = all_image_embeds / 
all_image_embeds.norm( - axis=-1, keepdim=True) - clip_score = (all_image_embeds * - all_text_embeds).sum(-1) * model.logit_scale.exp() + all_text_embeds = all_text_embeds / all_text_embeds.norm(axis=-1, keepdim=True) + all_image_embeds = all_image_embeds / all_image_embeds.norm(axis=-1, keepdim=True) + clip_score = (all_image_embeds * all_text_embeds).sum(-1) * model.logit_scale.exp() return clip_score if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "--image_path", default=None, nargs="+", type=str, help="image_path") + parser.add_argument("--image_path", default=None, nargs="+", type=str, help="image_path") parser.add_argument( "--output_file", default="statistic_results.json", type=str, - help="output file name", ) + help="output file name", + ) parser.add_argument( "--text_file_name", default="coco30k", choices=["coco1k", "coco10k", "coco30k"], type=str, - help="text file.", ) + help="text file.", + ) parser.add_argument( "--clip_model_name_or_path", default="openai/clip-vit-base-patch32", type=str, - help="clip_model_name_or_path", ) - parser.add_argument( - "--fid_batch_size", default=32, type=int, help="fid_batch_size") - parser.add_argument( - "--clip_batch_size", default=64, type=int, help="clip_batch_size") - parser.add_argument( - "--resolution", default=256, type=int, help="resolution of images") + help="clip_model_name_or_path", + ) + parser.add_argument("--fid_batch_size", default=32, type=int, help="fid_batch_size") + parser.add_argument("--clip_batch_size", default=64, type=int, help="clip_batch_size") + parser.add_argument("--resolution", default=256, type=int, help="resolution of images") parser.add_argument("--device", default="gpu", type=str, help="device") parser.add_argument( "--only_fid", action="store_true", - help=("Only eval fid. "), ) + help=("Only eval fid. 
"), + ) args = parser.parse_args() paddle.set_device(args.device) @@ -127,11 +123,9 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): else: os.environ["FLAG_IMAGE_NUM"] = "1000" dataset_name = f"coco_{args.resolution}_{image_num}.npz" - fid_target_file = get_path_from_url(base_url + dataset_name, - cache_path) + ".npz" + fid_target_file = get_path_from_url(base_url + dataset_name, cache_path) + ".npz" - text_file = get_path_from_url(base_url + text_file_name + ".tsv", - cache_path) + text_file = get_path_from_url(base_url + text_file_name + ".tsv", cache_path) df = pd.read_csv(text_file, sep="\t") texts = df["caption_en"].tolist() if not args.only_fid: @@ -149,18 +143,16 @@ def compute_clip_score(model, processor, texts, images_path, batch_size=64): [fid_target_file, path], batch_size=args.fid_batch_size, dims=2048, - num_workers=4, ) + num_workers=4, + ) results["fid"].append(fid_value) if not args.only_fid: # clip score - images_path = sorted([ - image_path - for ext in IMAGE_EXTENSIONS - for image_path in pathlib.Path(path).glob("*.{}".format(ext)) - ]) - clip_score = compute_clip_score(model, processor, texts, - images_path, args.clip_batch_size) + images_path = sorted( + [image_path for ext in IMAGE_EXTENSIONS for image_path in pathlib.Path(path).glob("*.{}".format(ext))] + ) + clip_score = compute_clip_score(model, processor, texts, images_path, args.clip_batch_size) if "clip_score" not in results: results["clip_score"] = [] _clip_score = clip_score.mean().item() diff --git a/ppdiffusers/scripts/fid_clip_score/fid_score.py b/ppdiffusers/scripts/fid_clip_score/fid_score.py index c73e4597015ad..9c6a81cb351c9 100755 --- a/ppdiffusers/scripts/fid_clip_score/fid_score.py +++ b/ppdiffusers/scripts/fid_clip_score/fid_score.py @@ -67,42 +67,37 @@ def tqdm(x): from inception import InceptionV3 parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument( - "--batch-size", type=int, default=50, help="Batch size to use") -parser.add_argument( - "--resolution", type=int, default=None, help="The resolution to resize.") +parser.add_argument("--batch-size", type=int, default=50, help="Batch size to use") +parser.add_argument("--resolution", type=int, default=None, help="The resolution to resize.") parser.add_argument( "--num-workers", type=int, - help=("Number of processes to use for data loading. " - "Defaults to `min(8, num_cpus)`"), ) -parser.add_argument( - "--device", - type=str, - default=None, - help="Device to use. Like cuda, cuda:0 or cpu") + help=("Number of processes to use for data loading. " "Defaults to `min(8, num_cpus)`"), +) +parser.add_argument("--device", type=str, default=None, help="Device to use. Like cuda, cuda:0 or cpu") parser.add_argument( "--dims", type=int, default=2048, choices=list(InceptionV3.BLOCK_INDEX_BY_DIM), - help=("Dimensionality of Inception features to use. " - "By default, uses pool3 features"), ) + help=("Dimensionality of Inception features to use. " "By default, uses pool3 features"), +) parser.add_argument( "--save-stats", action="store_true", - help=("Generate an npz archive from a directory of samples. " - "The first path is used as input and the second as output."), ) + help=( + "Generate an npz archive from a directory of samples. " + "The first path is used as input and the second as output." 
+ ), +) parser.add_argument( "path", type=str, nargs=2, - help=("Paths to the generated images or " - "to .npz statistic files"), ) + help=("Paths to the generated images or " "to .npz statistic files"), +) -IMAGE_EXTENSIONS = { - "bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp" -} +IMAGE_EXTENSIONS = {"bmp", "jpg", "jpeg", "pgm", "png", "ppm", "tif", "tiff", "webp"} class ImagePathDataset(paddle.io.Dataset): @@ -125,12 +120,7 @@ def __getitem__(self, i): return {"img": img} -def get_activations(files, - model, - batch_size=50, - dims=2048, - num_workers=1, - resolution=None): +def get_activations(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None): """Calculates the activations of the pool_3 layer for all images. Params: @@ -152,18 +142,17 @@ def get_activations(files, model.eval() if batch_size > len(files): - print(("Warning: batch size is bigger than the data size. " - "Setting batch size to data size")) + print(("Warning: batch size is bigger than the data size. " "Setting batch size to data size")) batch_size = len(files) - dataset = ImagePathDataset( - files, transforms=TF.ToTensor(), resolution=resolution) + dataset = ImagePathDataset(files, transforms=TF.ToTensor(), resolution=resolution) dataloader = paddle.io.DataLoader( dataset, batch_size=batch_size, shuffle=False, drop_last=False, - num_workers=num_workers, ) + num_workers=num_workers, + ) pred_arr = np.empty((len(files), dims)) @@ -181,7 +170,7 @@ def get_activations(files, pred = pred.squeeze(3).squeeze(2).cpu().numpy() - pred_arr[start_idx:start_idx + pred.shape[0]] = pred + pred_arr[start_idx : start_idx + pred.shape[0]] = pred start_idx = start_idx + pred.shape[0] @@ -216,18 +205,15 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): sigma1 = np.atleast_2d(sigma1) sigma2 = np.atleast_2d(sigma2) - assert (mu1.shape == mu2.shape - ), "Training and test mean vectors have different lengths" - assert (sigma1.shape == sigma2.shape - ), "Training and test covariances have different dimensions" + assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" + assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" diff = mu1 - mu2 # Product might be almost singular covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) if not np.isfinite(covmean).all(): - msg = ("fid calculation produces singular product; " - "adding %s to diagonal of cov estimates") % eps + msg = ("fid calculation produces singular product; " "adding %s to diagonal of cov estimates") % eps print(msg) offset = np.eye(sigma1.shape[0]) * eps covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) @@ -244,12 +230,7 @@ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean -def calculate_activation_statistics(files, - model, - batch_size=50, - dims=2048, - num_workers=1, - resolution=None): +def calculate_activation_statistics(files, model, batch_size=50, dims=2048, num_workers=1, resolution=None): """Calculation of the statistics used by the FID. Params: -- files : List of image files paths @@ -266,43 +247,28 @@ def calculate_activation_statistics(files, -- sigma : The covariance matrix of the activations of the pool_3 layer of the inception model. 
""" - act = get_activations( - files, model, batch_size, dims, num_workers, resolution=resolution) + act = get_activations(files, model, batch_size, dims, num_workers, resolution=resolution) mu = np.mean(act, axis=0) sigma = np.cov(act, rowvar=False) return mu, sigma -def compute_statistics_of_path(path, - model, - batch_size, - dims, - num_workers=1, - resolution=None): +def compute_statistics_of_path(path, model, batch_size, dims, num_workers=1, resolution=None): if path.endswith(".npz"): with np.load(path) as f: m, s = f["mu"][:], f["sigma"][:] else: path = pathlib.Path(path) - files = sorted([ - file - for ext in IMAGE_EXTENSIONS - for file in path.glob("*.{}".format(ext)) - ]) + files = sorted([file for ext in IMAGE_EXTENSIONS for file in path.glob("*.{}".format(ext))]) FLAG_IMAGE_NUM = os.getenv("FLAG_IMAGE_NUM", None) if FLAG_IMAGE_NUM is not None: - files = files[:int(FLAG_IMAGE_NUM)] - m, s = calculate_activation_statistics( - files, model, batch_size, dims, num_workers, resolution=resolution) + files = files[: int(FLAG_IMAGE_NUM)] + m, s = calculate_activation_statistics(files, model, batch_size, dims, num_workers, resolution=resolution) return m, s -def calculate_fid_given_paths(paths, - batch_size, - dims, - num_workers=1, - resolution=None): +def calculate_fid_given_paths(paths, batch_size, dims, num_workers=1, resolution=None): """Calculates the FID of two paths""" for p in paths: if not os.path.exists(p): @@ -312,11 +278,9 @@ def calculate_fid_given_paths(paths, model = InceptionV3([block_idx]) - m1, s1 = compute_statistics_of_path( - paths[0], model, batch_size, dims, num_workers, resolution=resolution) + m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution) - m2, s2 = compute_statistics_of_path( - paths[1], model, batch_size, dims, num_workers, resolution=resolution) + m2, s2 = compute_statistics_of_path(paths[1], model, batch_size, dims, num_workers, resolution=resolution) fid_value = calculate_frechet_distance(m1, s1, m2, s2) @@ -337,8 +301,7 @@ def save_fid_stats(paths, batch_size, dims, num_workers=1, resolution=None): print(f"Saving statistics for {paths[0]}") - m1, s1 = compute_statistics_of_path( - paths[0], model, batch_size, dims, num_workers, resolution=resolution) + m1, s1 = compute_statistics_of_path(paths[0], model, batch_size, dims, num_workers, resolution=resolution) np.savez_compressed(paths[1], mu=m1, sigma=s1) @@ -367,15 +330,13 @@ def main(): args.batch_size, args.dims, num_workers, - resolution=args.resolution, ) + resolution=args.resolution, + ) return fid_value = calculate_fid_given_paths( - args.path, - args.batch_size, - args.dims, - num_workers, - resolution=args.resolution) + args.path, args.batch_size, args.dims, num_workers, resolution=args.resolution + ) print("FID: ", fid_value) diff --git a/ppdiffusers/scripts/fid_clip_score/inception.py b/ppdiffusers/scripts/fid_clip_score/inception.py index 9aecdf265779a..bbdff9a933432 100644 --- a/ppdiffusers/scripts/fid_clip_score/inception.py +++ b/ppdiffusers/scripts/fid_clip_score/inception.py @@ -21,7 +21,8 @@ # http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz FID_WEIGHTS_URL = ( "https://paddlenlp.bj.bcebos.com/models/mseitzer/pp_inception-2015-12-05-6726825d.pdparams", - "8e2ae24c34c5c8b81d45167bb9361f4c", ) + "8e2ae24c34c5c8b81d45167bb9361f4c", +) WEIGHTS_PATH = "pp_inception-2015-12-05-6726825d.pdparams" @@ -47,17 +48,18 @@ class ConvNormActivation(nn.Sequential): """ def __init__( - self, - in_channels, - 
out_channels, - kernel_size=3, - stride=1, - padding=None, - groups=1, - norm_layer=nn.BatchNorm2D, - activation_layer=nn.ReLU, - dilation=1, - bias=None, ): + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=nn.BatchNorm2D, + activation_layer=nn.ReLU, + dilation=1, + bias=None, + ): if padding is None: padding = (kernel_size - 1) // 2 * dilation if bias is None: @@ -71,7 +73,8 @@ def __init__( padding, dilation=dilation, groups=groups, - bias_attr=bias, ) + bias_attr=bias, + ) ] if norm_layer is not None: # The hyperparameter of BatchNorm2D is different from PaddlePaddle. @@ -97,12 +100,13 @@ class InceptionV3(nn.Layer): } def __init__( - self, - output_blocks=(DEFAULT_BLOCK_INDEX, ), - resize_input=True, - normalize_input=True, - requires_grad=False, - use_fid_inception=True, ): + self, + output_blocks=(DEFAULT_BLOCK_INDEX,), + resize_input=True, + normalize_input=True, + requires_grad=False, + use_fid_inception=True, + ): """Build pretrained InceptionV3 Parameters @@ -211,8 +215,7 @@ def forward(self, inp): outp = [] x = inp if self.resize_input: - x = F.interpolate( - x, size=(299, 299), mode="bilinear", align_corners=False) + x = F.interpolate(x, size=(299, 299), mode="bilinear", align_corners=False) if self.normalize_input: x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1) @@ -235,8 +238,7 @@ def hack_bn_layer(layer): def _inception_v3(*args, **kwargs): """Wraps `paddle.vision.models.inception_v3`""" - return paddle.vision.models.inception_v3(*args, - **kwargs).apply(hack_bn_layer) + return paddle.vision.models.inception_v3(*args, **kwargs).apply(hack_bn_layer) def fid_inception_v3(): @@ -248,8 +250,7 @@ def fid_inception_v3(): This method first constructs paddle.vision's Inception and then patches the necessary parts that are different in the FID Inception model. 
""" - inception = _inception_v3( - num_classes=1008, with_pool=True, pretrained=False) + inception = _inception_v3(num_classes=1008, with_pool=True, pretrained=False) inception.inception_block_list[0] = InceptionA(192, pool_features=32) inception.inception_block_list[1] = InceptionA(256, pool_features=64) inception.inception_block_list[2] = InceptionA(288, pool_features=64) @@ -260,8 +261,7 @@ def fid_inception_v3(): inception.inception_block_list[9] = InceptionE_1(1280) inception.inception_block_list[10] = InceptionE_2(2048) - weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], - FID_WEIGHTS_URL[1]) + weight_path = get_weights_path_from_url(FID_WEIGHTS_URL[0], FID_WEIGHTS_URL[1]) state_dict = paddle.load(weight_path) inception.set_state_dict(state_dict) return inception @@ -275,49 +275,55 @@ def __init__(self, num_channels, pool_features): out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_1 = ConvNormActivation( in_channels=num_channels, out_channels=48, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch5x5_2 = ConvNormActivation( in_channels=48, out_channels=64, kernel_size=5, padding=2, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=64, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=64, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3 = ConvNormActivation( in_channels=96, out_channels=96, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=pool_features, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -330,8 +336,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) return x @@ -343,7 +348,8 @@ def __init__(self, num_channels, channels_7x7): out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_1 = ConvNormActivation( in_channels=num_channels, @@ -351,62 +357,70 @@ def __init__(self, num_channels, channels_7x7): kernel_size=1, stride=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), stride=1, padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(7, 1), stride=1, padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=channels_7x7, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + 
activation_layer=nn.ReLU, + ) self.branch7x7dbl_2 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_3 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_4 = ConvNormActivation( in_channels=channels_7x7, out_channels=channels_7x7, kernel_size=(7, 1), padding=(3, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch7x7dbl_5 = ConvNormActivation( in_channels=channels_7x7, out_channels=192, kernel_size=(1, 7), padding=(0, 3), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -424,8 +438,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) return x @@ -438,61 +451,69 @@ def __init__(self, num_channels): out_channels=320, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_1 = ConvNormActivation( in_channels=num_channels, out_channels=384, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3_2b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_1 = ConvNormActivation( in_channels=num_channels, out_channels=448, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_2 = ConvNormActivation( in_channels=448, out_channels=384, kernel_size=3, padding=1, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3a = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(1, 3), padding=(0, 1), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) self.branch3x3dbl_3b = ConvNormActivation( in_channels=384, out_channels=384, kernel_size=(3, 1), padding=(1, 0), - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) # Patch: Tensorflow's average pool does not use the padded zero's in # its average calculation - self.branch_pool = nn.AvgPool2D( - kernel_size=3, stride=1, padding=1, exclusive=True) + self.branch_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True) self.branch_pool_conv = ConvNormActivation( in_channels=num_channels, out_channels=192, kernel_size=1, padding=0, - activation_layer=nn.ReLU, ) + activation_layer=nn.ReLU, + ) def forward(self, x): branch1x1 = self.branch1x1(x) @@ -515,8 +536,7 @@ def forward(self, x): branch_pool = self.branch_pool(x) branch_pool = 
self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x @@ -549,6 +569,5 @@ def forward(self, x): branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1) branch_pool = self.branch_pool_conv(branch_pool) - x = paddle.concat( - [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + x = paddle.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) return x diff --git a/ppdiffusers/setup.py b/ppdiffusers/setup.py index bb412f60fc4f4..a5d0f3cf3b5e9 100644 --- a/ppdiffusers/setup.py +++ b/ppdiffusers/setup.py @@ -57,10 +57,7 @@ def read_requirements(): keywords=["ppdiffusers", "paddle", "paddlemix"], install_requires=REQUIRED_PACKAGES, python_requires=">=3.6", - entry_points={ - "console_scripts": - ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"] - }, + entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]}, classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", @@ -70,4 +67,5 @@ def read_requirements(): "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], - license="Apache 2.0", ) + license="Apache 2.0", +) diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py b/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py index 7f987b99141b8..aa10a342c68d4 100644 --- a/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py +++ b/ppdiffusers/tests/fixtures/custom_pipeline/pipeline.py @@ -38,13 +38,14 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[paddle.Generator]=None, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[paddle.Generator] = None, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -74,8 +75,10 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -95,6 +98,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ), "This is a local test" + return (image,), "This is a local test" return ImagePipelineOutput(images=image), "This is a local test" diff --git a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py b/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py index d562cd9e580cc..ebdc7650dafd2 100644 --- a/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py +++ b/ppdiffusers/tests/fixtures/custom_pipeline/what_ever.py @@ -38,13 +38,14 @@ def __init__(self, unet, scheduler): @paddle.no_grad() def __call__( - self, - batch_size: int=1, - generator: Optional[paddle.Generator]=None, - num_inference_steps: int=50, - output_type: Optional[str]="pil", - return_dict: bool=True, - **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: + self, + batch_size: int = 1, + generator: Optional[paddle.Generator] = None, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> 
Union[ImagePipelineOutput, Tuple]: r""" Args: batch_size (`int`, *optional*, defaults to 1): @@ -74,8 +75,10 @@ def __call__( batch_size, self.unet.config.in_channels, self.unet.config.sample_size, - self.unet.config.sample_size, ), - generator=generator, ) + self.unet.config.sample_size, + ), + generator=generator, + ) # set step values self.scheduler.set_timesteps(num_inference_steps) @@ -95,6 +98,6 @@ def __call__( image = self.numpy_to_pil(image) if not return_dict: - return (image, ), "This is a local test" + return (image,), "This is a local test" return ImagePipelineOutput(images=image), "This is a local test" diff --git a/ppdiffusers/tests/models/test_attention_processor.py b/ppdiffusers/tests/models/test_attention_processor.py index 84b2d1e9263cb..f47ddfa4abb1d 100644 --- a/ppdiffusers/tests/models/test_attention_processor.py +++ b/ppdiffusers/tests/models/test_attention_processor.py @@ -16,12 +16,11 @@ import paddle -from ppdiffusers.models.attention_processor import (Attention, - AttnAddedKVProcessor) +from ppdiffusers.models.attention_processor import Attention, AttnAddedKVProcessor class AttnAddedKVProcessorTests(unittest.TestCase): - def get_constructor_arguments(self, only_cross_attention: bool=False): + def get_constructor_arguments(self, only_cross_attention: bool = False): query_dim = 10 if only_cross_attention: @@ -59,8 +58,7 @@ def test_only_cross_attention(self): paddle.seed(0) - constructor_args = self.get_constructor_arguments( - only_cross_attention=False) + constructor_args = self.get_constructor_arguments(only_cross_attention=False) attn = Attention(**constructor_args) self.assertTrue(attn.to_k is not None) @@ -68,7 +66,8 @@ def test_only_cross_attention(self): forward_args = self.get_forward_arguments( query_dim=constructor_args["query_dim"], - added_kv_proj_dim=constructor_args["added_kv_proj_dim"], ) + added_kv_proj_dim=constructor_args["added_kv_proj_dim"], + ) self_and_cross_attn_out = attn(**forward_args) @@ -76,8 +75,7 @@ def test_only_cross_attention(self): paddle.seed(0) - constructor_args = self.get_constructor_arguments( - only_cross_attention=True) + constructor_args = self.get_constructor_arguments(only_cross_attention=True) attn = Attention(**constructor_args) self.assertTrue(attn.to_k is None) @@ -85,7 +83,8 @@ def test_only_cross_attention(self): forward_args = self.get_forward_arguments( query_dim=constructor_args["query_dim"], - added_kv_proj_dim=constructor_args["added_kv_proj_dim"], ) + added_kv_proj_dim=constructor_args["added_kv_proj_dim"], + ) only_cross_attn_out = attn(**forward_args) diff --git a/ppdiffusers/tests/models/test_layers_utils.py b/ppdiffusers/tests/models/test_layers_utils.py index 6bfcd5b37fbab..32480c6e215df 100644 --- a/ppdiffusers/tests/models/test_layers_utils.py +++ b/ppdiffusers/tests/models/test_layers_utils.py @@ -19,8 +19,12 @@ import paddle import paddle.nn -from ppdiffusers.models.attention import (GEGLU, AdaLayerNorm, ApproximateGELU, - AttentionBlock) +from ppdiffusers.models.attention import ( + GEGLU, + AdaLayerNorm, + ApproximateGELU, + AttentionBlock, +) from ppdiffusers.models.embeddings import get_timestep_embedding from ppdiffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D from ppdiffusers.models.transformer_2d import Transformer2DModel @@ -31,8 +35,8 @@ def test_timestep_embeddings(self): embedding_dim = 256 timesteps = paddle.arange(start=16) t1 = get_timestep_embedding(timesteps, embedding_dim) - assert (t1[0, :embedding_dim // 2] - 0).abs().sum() < 1e-05 - assert (t1[0, embedding_dim 
// 2:] - 1).abs().sum() < 1e-05 + assert (t1[0, : embedding_dim // 2] - 0).abs().sum() < 1e-05 + assert (t1[0, embedding_dim // 2 :] - 1).abs().sum() < 1e-05 assert (t1[:, -1] - 1).abs().sum() < 1e-05 grad_mean = np.abs(np.gradient(t1, axis=-1)).mean(axis=1) prev_grad = 0.0 @@ -49,72 +53,59 @@ def test_timestep_defaults(self): embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, - max_period=10000, ) + max_period=10000, + ) assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01) def test_timestep_flip_sin_cos(self): embedding_dim = 16 timesteps = paddle.arange(start=10) - t1 = get_timestep_embedding( - timesteps, embedding_dim, flip_sin_to_cos=True) - t1 = paddle.concat( - x=[t1[:, embedding_dim // 2:], t1[:, :embedding_dim // 2]], axis=-1) - t2 = get_timestep_embedding( - timesteps, embedding_dim, flip_sin_to_cos=False) + t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True) + t1 = paddle.concat(x=[t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], axis=-1) + t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False) assert paddle.allclose(t1.cpu(), t2.cpu(), atol=0.01) def test_timestep_downscale_freq_shift(self): embedding_dim = 16 timesteps = paddle.arange(start=10) - t1 = get_timestep_embedding( - timesteps, embedding_dim, downscale_freq_shift=0) - t2 = get_timestep_embedding( - timesteps, embedding_dim, downscale_freq_shift=1) - cosine_half = (t1 - t2)[:, embedding_dim // 2:] + t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0) + t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1) + cosine_half = (t1 - t2)[:, embedding_dim // 2 :] assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-05 def test_sinoid_embeddings_hardcoded(self): embedding_dim = 64 timesteps = paddle.arange(start=128) - t1 = get_timestep_embedding( - timesteps, - embedding_dim, - downscale_freq_shift=1, - flip_sin_to_cos=False) - t2 = get_timestep_embedding( - timesteps, - embedding_dim, - downscale_freq_shift=0, - flip_sin_to_cos=True) + t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False) + t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0, flip_sin_to_cos=True) t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000) assert paddle.allclose( t1[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([ - 0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, - 0.9872 - ]), - atol=0.01, ) + paddle.to_tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]), + atol=0.01, + ) assert paddle.allclose( t2[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([ - 0.3019, 0.228, 0.1716, 0.3146, 0.2377, 0.179, 0.3272, 0.2474, - 0.1864 - ]), - atol=0.01, ) + paddle.to_tensor([0.3019, 0.228, 0.1716, 0.3146, 0.2377, 0.179, 0.3272, 0.2474, 0.1864]), + atol=0.01, + ) assert paddle.allclose( t3[23:26, 47:50].flatten().cpu(), - paddle.to_tensor([ - -0.9801, - -0.9464, - -0.9349, - -0.3952, - 0.8887, - -0.9709, - 0.5299, - -0.2853, - -0.9927, - ]), - atol=0.01, ) + paddle.to_tensor( + [ + -0.9801, + -0.9464, + -0.9349, + -0.3952, + 0.8887, + -0.9709, + 0.5299, + -0.2853, + -0.9927, + ] + ), + atol=0.01, + ) class Upsample2DBlockTests(unittest.TestCase): @@ -126,19 +117,20 @@ def test_upsample_default(self): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 32, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -1.50215650, - -0.12905766, - -0.12905766, - 
-1.97015178, - 0.78776687, - 0.78776687, - -1.97015178, - 0.78776687, - 0.78776687, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -1.50215650, + -0.12905766, + -0.12905766, + -1.97015178, + 0.78776687, + 0.78776687, + -1.97015178, + 0.78776687, + 0.78776687, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_upsample_with_conv(self): paddle.seed(0) @@ -148,19 +140,20 @@ def test_upsample_with_conv(self): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 32, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.4583871364593506, - -0.8221798539161682, - -0.8228907585144043, - 0.3325321078300476, - -0.24422502517700195, - 1.344732642173767, - 0.5239212512969971, - -0.4814918637275696, - 0.17928099632263184, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.4583871364593506, + -0.8221798539161682, + -0.8228907585144043, + 0.3325321078300476, + -0.24422502517700195, + 1.344732642173767, + 0.5239212512969971, + -0.4814918637275696, + 0.17928099632263184, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_upsample_with_conv_out_dim(self): paddle.seed(0) @@ -170,42 +163,43 @@ def test_upsample_with_conv_out_dim(self): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 64, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.9049283266067505, - -1.6125869750976562, - -1.0837469100952148, - 0.24520659446716309, - -0.6669139266014099, - 0.5660533905029297, - 1.1056761741638184, - 2.1717309951782227, - 0.7197026610374451, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.9049283266067505, + -1.6125869750976562, + -1.0837469100952148, + 0.24520659446716309, + -0.6669139266014099, + 0.5660533905029297, + 1.1056761741638184, + 2.1717309951782227, + 0.7197026610374451, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_upsample_with_transpose(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 32, 32]) - upsample = Upsample2D( - channels=32, use_conv=False, use_conv_transpose=True) + upsample = Upsample2D(channels=32, use_conv=False, use_conv_transpose=True) with paddle.no_grad(): upsampled = upsample(sample) assert tuple(upsampled.shape) == (1, 32, 64, 64) output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.05951342731714249, - 0.26951998472213745, - 0.2600363492965698, - 1.12237548828125, - -0.07744798064231873, - 0.006375734228640795, - 0.6678807735443115, - 0.44324278831481934, - -0.10978640615940094, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.05951342731714249, + 0.26951998472213745, + 0.2600363492965698, + 1.12237548828125, + -0.07744798064231873, + 0.006375734228640795, + 0.6678807735443115, + 0.44324278831481934, + -0.10978640615940094, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class Downsample2DBlockTests(unittest.TestCase): @@ -217,17 +211,19 @@ def test_downsample_default(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 32, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.24012964963912964, 
- -0.034197285771369934, - -1.0328047275543213, - 0.7861506938934326, - -0.2086063176393509, - -0.3999312222003937, - 0.25081655383110046, - -0.23891538381576538, - -1.4398303031921387, - ]) + expected_slice = paddle.to_tensor( + [ + -0.24012964963912964, + -0.034197285771369934, + -1.0328047275543213, + 0.7861506938934326, + -0.2086063176393509, + -0.3999312222003937, + 0.25081655383110046, + -0.23891538381576538, + -1.4398303031921387, + ] + ) max_diff = (output_slice.flatten() - expected_slice).abs().sum().item() assert max_diff <= 0.001 @@ -239,19 +235,20 @@ def test_downsample_with_conv(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 32, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.009430217556655407, - 0.8657761216163635, - 1.7985490560531616, - -0.61894291639328, - -2.5752196311950684, - 1.2352519035339355, - 0.6046919822692871, - -1.6499173641204834, - -1.5272349119186401, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.009430217556655407, + 0.8657761216163635, + 1.7985490560531616, + -0.61894291639328, + -2.5752196311950684, + 1.2352519035339355, + 0.6046919822692871, + -1.6499173641204834, + -1.5272349119186401, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_downsample_with_conv_pad1(self): paddle.seed(0) @@ -261,19 +258,20 @@ def test_downsample_with_conv_pad1(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 32, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.009430217556655407, - 0.8657761216163635, - 1.7985490560531616, - -0.61894291639328, - -2.5752196311950684, - 1.2352519035339355, - 0.6046919822692871, - -1.6499173641204834, - -1.5272349119186401, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.009430217556655407, + 0.8657761216163635, + 1.7985490560531616, + -0.61894291639328, + -2.5752196311950684, + 1.2352519035339355, + 0.6046919822692871, + -1.6499173641204834, + -1.5272349119186401, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_downsample_with_conv_out_dim(self): paddle.seed(0) @@ -283,19 +281,20 @@ def test_downsample_with_conv_out_dim(self): downsampled = downsample(sample) assert tuple(downsampled.shape) == (1, 16, 32, 32) output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.10819266736507416, - 0.43043053150177, - -0.7322822213172913, - -1.923148512840271, - 1.0195047855377197, - 0.48796477913856506, - 1.6765365600585938, - -4.072991847991943, - 0.8763526082038879, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.10819266736507416, + 0.43043053150177, + -0.7322822213172913, + -1.923148512840271, + 1.0195047855377197, + 0.48796477913856506, + 1.6765365600585938, + -4.072991847991943, + 0.8763526082038879, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class ResnetBlock2DTests(unittest.TestCase): @@ -308,43 +307,44 @@ def test_resnet_default(self): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 64, 64) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.9816107749938965, - 1.4443503618240356, - -1.0354782342910767, - 
0.23985600471496582, - -1.0868161916732788, - -1.5830397605895996, - -0.041037797927856445, - -1.2574901580810547, - -0.5504958629608154, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.9816107749938965, + 1.4443503618240356, + -1.0354782342910767, + 0.23985600471496582, + -1.0868161916732788, + -1.5830397605895996, + -0.041037797927856445, + -1.2574901580810547, + -0.5504958629608154, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_restnet_with_use_in_shortcut(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, use_in_shortcut=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, use_in_shortcut=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 64, 64) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.9861348867416382, - -1.097771406173706, - 0.268703430891037, - 0.40997087955474854, - -4.26219367980957, - 1.758486270904541, - -0.8979732990264893, - 0.30774950981140137, - 3.2780206203460693, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.9861348867416382, + -1.097771406173706, + 0.268703430891037, + 0.40997087955474854, + -4.26219367980957, + 1.758486270904541, + -0.8979732990264893, + 0.30774950981140137, + 3.2780206203460693, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_resnet_up(self): paddle.seed(0) @@ -355,91 +355,92 @@ def test_resnet_up(self): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 128, 128) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.2874237298965454, - -2.6432056427001953, - -2.1900298595428467, - -0.48899877071380615, - -1.1637755632400513, - -1.084446907043457, - -1.1333439350128174, - 0.2726985812187195, - -0.014697253704071045, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.2874237298965454, + -2.6432056427001953, + -2.1900298595428467, + -0.48899877071380615, + -1.1637755632400513, + -1.084446907043457, + -1.1333439350128174, + 0.2726985812187195, + -0.014697253704071045, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_resnet_down(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, down=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, down=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 32, 32) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.54087495803833, - 0.26700693368911743, - -0.540952742099762, - 2.7190208435058594, - -0.09766747057437897, - 0.23407122492790222, - 0.47980907559394836, - 0.6348602771759033, - -0.75424242019653322, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.54087495803833, + 0.26700693368911743, + -0.540952742099762, + 2.7190208435058594, + -0.09766747057437897, + 0.23407122492790222, + 0.47980907559394836, + 
0.6348602771759033, + -0.75424242019653322, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_restnet_with_kernel_fir(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, kernel="fir", down=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="fir", down=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 32, 32) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 0.9914248585700989, - 0.4773162007331848, - -0.021942138671875, - 2.482321262359619, - 0.18839354813098907, - 0.1516135334968567, - 0.7221578359603882, - 0.3920581340789795, - -0.24661940336227417, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 0.9914248585700989, + 0.4773162007331848, + -0.021942138671875, + 2.482321262359619, + 0.18839354813098907, + 0.1516135334968567, + 0.7221578359603882, + 0.3920581340789795, + -0.24661940336227417, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_restnet_with_kernel_sde_vp(self): paddle.seed(0) sample = paddle.randn(shape=[1, 32, 64, 64]) temb = paddle.randn(shape=[1, 128]) - resnet_block = ResnetBlock2D( - in_channels=32, temb_channels=128, kernel="sde_vp", down=True) + resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="sde_vp", down=True) with paddle.no_grad(): output_tensor = resnet_block(sample, temb) assert tuple(output_tensor.shape) == (1, 32, 32, 32) output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.54087495803833, - 0.26700693368911743, - -0.540952742099762, - 2.7190208435058594, - -0.09766747057437897, - 0.23407122492790222, - 0.47980907559394836, - 0.6348602771759033, - -0.7542424201965332, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.54087495803833, + 0.26700693368911743, + -0.540952742099762, + 2.7190208435058594, + -0.09766747057437897, + 0.23407122492790222, + 0.47980907559394836, + 0.6348602771759033, + -0.7542424201965332, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class AttentionBlockTests(unittest.TestCase): @@ -451,50 +452,49 @@ def test_attention_block_default(self): num_head_channels=1, rescale_output_factor=1.0, eps=1e-06, - norm_num_groups=32, ) + norm_num_groups=32, + ) with paddle.no_grad(): attention_scores = attentionBlock(sample) assert attention_scores.shape == [1, 32, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 1.638939619064331, - -0.15776772797107697, - -1.1130025386810303, - -0.8540273904800415, - -0.5696781873703003, - -2.0493741035461426, - -0.3732607960700989, - -1.740313172340393, - -0.5271167755126953, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 1.638939619064331, + -0.15776772797107697, + -1.1130025386810303, + -0.8540273904800415, + -0.5696781873703003, + -2.0493741035461426, + -0.3732607960700989, + -1.740313172340393, + -0.5271167755126953, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_attention_block_sd(self): paddle.seed(0) sample = paddle.randn(shape=[1, 512, 64, 64]) - 
attentionBlock = AttentionBlock( - channels=512, - rescale_output_factor=1.0, - eps=1e-06, - norm_num_groups=32) + attentionBlock = AttentionBlock(channels=512, rescale_output_factor=1.0, eps=1e-06, norm_num_groups=32) with paddle.no_grad(): attention_scores = attentionBlock(sample) assert attention_scores.shape == [1, 512, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.8007570505142212, - -0.770350992679596, - -3.5278191566467285, - -2.0540268421173096, - -0.7711739540100098, - -0.8278288245201111, - -0.48292720317840576, - 1.6039936542510986, - 0.626724362373352, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.8007570505142212, + -0.770350992679596, + -3.5278191566467285, + -2.0540268421173096, + -0.7711739540100098, + -0.8278288245201111, + -0.48292720317840576, + 1.6039936542510986, + 0.626724362373352, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) class Transformer2DModelTests(unittest.TestCase): @@ -506,24 +506,26 @@ def test_spatial_transformer_default(self): num_attention_heads=1, attention_head_dim=32, dropout=0.0, - cross_attention_dim=None, ) + cross_attention_dim=None, + ) with paddle.no_grad(): attention_scores = spatial_transformer_block(sample).sample assert attention_scores.shape == [1, 32, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 2.6310853958129883, - 5.990478515625, - 0.5715246200561523, - -2.5269505977630615, - -2.853764057159424, - -5.163403511047363, - 0.2880846858024597, - -5.925153732299805, - 2.316770076751709, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 2.6310853958129883, + 5.990478515625, + 0.5715246200561523, + -2.5269505977630615, + -2.853764057159424, + -5.163403511047363, + 0.2880846858024597, + -5.925153732299805, + 2.316770076751709, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_cross_attention_dim(self): paddle.seed(0) @@ -533,25 +535,27 @@ def test_spatial_transformer_cross_attention_dim(self): num_attention_heads=2, attention_head_dim=32, dropout=0.0, - cross_attention_dim=64, ) + cross_attention_dim=64, + ) with paddle.no_grad(): context = paddle.randn(shape=[1, 4, 64]) attention_scores = spatial_transformer_block(sample, context).sample assert attention_scores.shape == [1, 64, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - -0.08756911754608154, - -3.94197940826416, - -0.25678586959838867, - 2.1481714248657227, - 2.327033042907715, - 0.29948690533638, - 1.3845969438552856, - 0.7825677394866943, - 1.4856826066970825, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.08756911754608154, + -3.94197940826416, + -0.25678586959838867, + 2.1481714248657227, + 2.327033042907715, + 0.29948690533638, + 1.3845969438552856, + 0.7825677394866943, + 1.4856826066970825, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_timestep(self): paddle.seed(0) @@ -563,44 +567,45 @@ def test_spatial_transformer_timestep(self): attention_head_dim=32, dropout=0.0, cross_attention_dim=64, - num_embeds_ada_norm=num_embeds_ada_norm, ) + num_embeds_ada_norm=num_embeds_ada_norm, + ) with paddle.no_grad(): timestep_1 = 
paddle.to_tensor(1, dtype="int64") timestep_2 = paddle.to_tensor(2, dtype="int64") - attention_scores_1 = spatial_transformer_block( - sample, timestep=timestep_1).sample - attention_scores_2 = spatial_transformer_block( - sample, timestep=timestep_2).sample + attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample + attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample assert tuple(attention_scores_1.shape) == (1, 64, 64, 64) assert tuple(attention_scores_2.shape) == (1, 64, 64, 64) output_slice_1 = attention_scores_1[0, -1, -3:, -3:] output_slice_2 = attention_scores_2[0, -1, -3:, -3:] - expected_slice_1 = paddle.to_tensor([ - -0.15322405099868774, - -1.265586018562317, - -5.424124717712402, - -0.7333418130874634, - -0.5904415249824524, - 0.9293081760406494, - 1.1033945083618164, - -5.200987815856934, - -0.7598087787628174, - ]) - expected_slice_2 = paddle.to_tensor([ - 0.12572699785232544, - -1.0498149394989014, - -5.207070350646973, - -0.41757693886756897, - -0.25374162197113037, - 1.152648687362671, - 1.422953724861145, - -4.933906078338623, - -0.564710259437561, - ]) - assert paddle.allclose( - output_slice_1.flatten(), expected_slice_1, atol=0.01) - assert paddle.allclose( - output_slice_2.flatten(), expected_slice_2, atol=0.01) + expected_slice_1 = paddle.to_tensor( + [ + -0.15322405099868774, + -1.265586018562317, + -5.424124717712402, + -0.7333418130874634, + -0.5904415249824524, + 0.9293081760406494, + 1.1033945083618164, + -5.200987815856934, + -0.7598087787628174, + ] + ) + expected_slice_2 = paddle.to_tensor( + [ + 0.12572699785232544, + -1.0498149394989014, + -5.207070350646973, + -0.41757693886756897, + -0.25374162197113037, + 1.152648687362671, + 1.422953724861145, + -4.933906078338623, + -0.564710259437561, + ] + ) + assert paddle.allclose(output_slice_1.flatten(), expected_slice_1, atol=0.01) + assert paddle.allclose(output_slice_2.flatten(), expected_slice_2, atol=0.01) def test_spatial_transformer_dropout(self): paddle.seed(0) @@ -610,24 +615,26 @@ def test_spatial_transformer_dropout(self): num_attention_heads=2, attention_head_dim=16, dropout=0.3, - cross_attention_dim=None, ).eval() + cross_attention_dim=None, + ).eval() with paddle.no_grad(): attention_scores = spatial_transformer_block(sample).sample assert attention_scores.shape == [1, 32, 64, 64] output_slice = attention_scores[0, -1, -3:, -3:] - expected_slice = paddle.to_tensor([ - 2.535370349884033, - 6.2350993156433105, - 0.8244613409042358, - -2.6684911251068115, - -2.758057117462158, - -5.176937103271484, - 0.3372979760169983, - -5.837750434875488, - 2.3483340740203857, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + 2.535370349884033, + 6.2350993156433105, + 0.8244613409042358, + -2.6684911251068115, + -2.758057117462158, + -5.176937103271484, + 0.3372979760169983, + -5.837750434875488, + 2.3483340740203857, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_discrete(self): paddle.seed(0) @@ -637,99 +644,75 @@ def test_spatial_transformer_discrete(self): num_attention_heads=1, attention_head_dim=32, num_vector_embeds=num_embed, - sample_size=16, ).eval() + sample_size=16, + ).eval() with paddle.no_grad(): attention_scores = spatial_transformer_block(sample).sample assert attention_scores.shape == [1, num_embed - 1, 32] output_slice = attention_scores[0, -2:, -3:] - expected_slice = paddle.to_tensor([ - 
-0.14130862057209015, - -0.14278407394886017, - -0.498604953289032, - -3.2408740520477295, - -3.852043390274048, - -2.099970579147339, - ]) - assert paddle.allclose( - output_slice.flatten(), expected_slice, atol=0.01) + expected_slice = paddle.to_tensor( + [ + -0.14130862057209015, + -0.14278407394886017, + -0.498604953289032, + -3.2408740520477295, + -3.852043390274048, + -2.099970579147339, + ] + ) + assert paddle.allclose(output_slice.flatten(), expected_slice, atol=0.01) def test_spatial_transformer_default_norm_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32) - assert (spatial_transformer_block.transformer_blocks[0].norm1.__class__ - == paddle.nn.LayerNorm) - assert (spatial_transformer_block.transformer_blocks[0].norm3.__class__ - == paddle.nn.LayerNorm) + spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) + assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == paddle.nn.LayerNorm + assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm def test_spatial_transformer_ada_norm_layers(self): spatial_transformer_block = Transformer2DModel( num_attention_heads=1, attention_head_dim=32, in_channels=32, - num_embeds_ada_norm=5, ) - assert (spatial_transformer_block.transformer_blocks[0].norm1.__class__ - == AdaLayerNorm) - assert (spatial_transformer_block.transformer_blocks[0].norm3.__class__ - == paddle.nn.LayerNorm) + num_embeds_ada_norm=5, + ) + assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm + assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == paddle.nn.LayerNorm def test_spatial_transformer_default_ff_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ - == GEGLU) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ - == paddle.nn.Dropout) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ - == paddle.nn.Linear) + spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU + assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear dim = 32 inner_dim = 128 - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[0] == dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[1] == inner_dim * 2) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[0] == inner_dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[1] == dim) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim * 2 + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim def test_spatial_transformer_geglu_approx_ff_layers(self): spatial_transformer_block = Transformer2DModel( 
num_attention_heads=1, attention_head_dim=32, in_channels=32, - activation_fn="geglu-approximate", ) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ - == ApproximateGELU) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ - == paddle.nn.Dropout) - assert ( - spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ - == paddle.nn.Linear) + activation_fn="geglu-approximate", + ) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU + assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == paddle.nn.Dropout + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == paddle.nn.Linear dim = 32 inner_dim = 128 - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[0] == dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[0] - .proj.weight.shape[1] == inner_dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[0] == inner_dim) - assert (spatial_transformer_block.transformer_blocks[0].ff.net[2] - .weight.shape[1] == dim) + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[0] == dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.weight.shape[1] == inner_dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[0] == inner_dim + assert spatial_transformer_block.transformer_blocks[0].ff.net[2].weight.shape[1] == dim def test_spatial_transformer_attention_bias(self): spatial_transformer_block = Transformer2DModel( num_attention_heads=1, attention_head_dim=32, in_channels=32, - attention_bias=True, ) - assert (spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias - is not None) - assert (spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias - is not None) - assert (spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias - is not None) + attention_bias=True, + ) + assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None + assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None + assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None diff --git a/ppdiffusers/tests/models/test_lora_layers.py b/ppdiffusers/tests/models/test_lora_layers.py index 14c192e1e5ea8..97335fe48e3b5 100644 --- a/ppdiffusers/tests/models/test_lora_layers.py +++ b/ppdiffusers/tests/models/test_lora_layers.py @@ -20,8 +20,12 @@ import paddle.nn as nn from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.loaders import AttnProcsLayers, LoraLoaderMixin from ppdiffusers.models.attention_processor import LoRAAttnProcessor from ppdiffusers.utils import TEXT_ENCODER_ATTN_MODULE, floats_tensor @@ -30,19 +34,16 @@ def create_unet_lora_layers(unet: nn.Layer): lora_attn_procs = {} for name in unet.attn_processors.keys(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - unet.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = unet.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = 
int(name[len("up_blocks.")]) - hidden_size = list(reversed(unet.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(unet.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) unet_lora_layers = AttnProcsLayers(lora_attn_procs) return lora_attn_procs, unet_lora_layers @@ -52,8 +53,8 @@ def create_text_encoder_lora_layers(text_encoder: nn.Layer): for name, module in text_encoder.named_sublayers(include_self=True): if name.endswith(TEXT_ENCODER_ATTN_MODULE): text_lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=module.out_proj.weight.shape[1], - cross_attention_dim=None) + hidden_size=module.out_proj.weight.shape[1], cross_attention_dim=None + ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) return text_encoder_lora_layers @@ -70,14 +71,16 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -85,7 +88,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) text_encoder_config = CLIPTextConfig( bos_token_id=0, eos_token_id=2, @@ -95,11 +99,11 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config) text_encoder.eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet) text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder) @@ -128,11 +132,7 @@ def get_dummy_inputs(self): generator = paddle.Generator().manual_seed(0) noise = floats_tensor((batch_size, num_channels) + sizes) - input_ids = paddle.randint( - 1, - sequence_length, - size=(batch_size, sequence_length), - generator=generator) + input_ids = paddle.randint(1, sequence_length, size=(batch_size, sequence_length), generator=generator) pipeline_inputs = { "prompt": "A painting of a squirrel eating a burger", @@ -158,22 +158,17 @@ def test_lora_save_load(self): LoraLoaderMixin.save_lora_weights( save_directory=tmpdirname, unet_lora_layers=lora_components["unet_lora_layers"], - text_encoder_lora_layers=lora_components[ - "text_encoder_lora_layers"], - to_diffusers=False, ) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], + to_diffusers=False, + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False) lora_images = sd_pipe(**pipeline_inputs).images 
lora_image_slice = lora_images[0, -3:, -3:, -1] # Outputs shouldn't match. - self.assertFalse( - paddle.allclose( - paddle.to_tensor(orig_image_slice), - paddle.to_tensor(lora_image_slice))) + self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice))) def test_lora_save_load_safetensors(self): pipeline_components, lora_components = self.get_dummy_components() @@ -189,24 +184,18 @@ def test_lora_save_load_safetensors(self): LoraLoaderMixin.save_lora_weights( save_directory=tmpdirname, unet_lora_layers=lora_components["unet_lora_layers"], - text_encoder_lora_layers=lora_components[ - "text_encoder_lora_layers"], + text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], safe_serialization=True, - to_diffusers=True, ) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "pytorch_lora_weights.safetensors"))) + to_diffusers=True, + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) sd_pipe.load_lora_weights(tmpdirname, from_diffusers=True) lora_images = sd_pipe(**pipeline_inputs).images lora_image_slice = lora_images[0, -3:, -3:, -1] # Outputs shouldn't match. - self.assertFalse( - paddle.allclose( - paddle.to_tensor(orig_image_slice), - paddle.to_tensor(lora_image_slice))) + self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice))) def test_lora_save_load_legacy(self): pipeline_components, lora_components = self.get_dummy_components() @@ -223,16 +212,11 @@ def test_lora_save_load_legacy(self): unet = sd_pipe.unet unet.set_attn_processor(unet_lora_attn_procs) unet.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) sd_pipe.load_lora_weights(tmpdirname, from_diffusers=False) lora_images = sd_pipe(**pipeline_inputs).images lora_image_slice = lora_images[0, -3:, -3:, -1] # Outputs shouldn't match. - self.assertFalse( - paddle.allclose( - paddle.to_tensor(orig_image_slice), - paddle.to_tensor(lora_image_slice))) + self.assertFalse(paddle.allclose(paddle.to_tensor(orig_image_slice), paddle.to_tensor(lora_image_slice))) diff --git a/ppdiffusers/tests/models/test_modeling_common.py b/ppdiffusers/tests/models/test_modeling_common.py index 2224b1d99e300..8780b3abc746b 100644 --- a/ppdiffusers/tests/models/test_modeling_common.py +++ b/ppdiffusers/tests/models/test_modeling_common.py @@ -45,12 +45,14 @@ def test_cached_files_are_used_when_no_internet(self): response_mock.raise_for_status.side_effect = HTTPError response_mock.json.return_value = {} orig_model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet" + ) with mock.patch("requests.request", return_value=response_mock): model = UNet2DConditionModel.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", - local_files_only=True, ) + local_files_only=True, + ) for p1, p2 in zip(orig_model.parameters(), model.parameters()): if (p1 != p2).cast("int64").sum() > 0: assert False, "Parameters not the same!" 
@@ -67,13 +69,12 @@ def test_one_request_upon_cached(self): subfolder="unet", cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) download_requests = [r.method for r in m.request_history] - assert (download_requests.count("HEAD") == 2 - ), "2 HEAD requests one for config, one for model" - assert (download_requests.count("GET") == 2 - ), "2 GET requests one for config, one for model" + assert download_requests.count("HEAD") == 2, "2 HEAD requests one for config, one for model" + assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model" with requests_mock.mock(real_http=True) as m: UNet2DConditionModel.from_pretrained( @@ -81,7 +82,8 @@ def test_one_request_upon_cached(self): subfolder="unet", cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) cache_requests = [r.method for r in m.request_history] # TODO check this @@ -92,15 +94,15 @@ def test_one_request_upon_cached(self): ppdiffusers.utils.import_utils._safetensors_available = True def test_weight_overwrite(self): - with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises( - RuntimeError) as error_context: + with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(RuntimeError) as error_context: UNet2DConditionModel.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", cache_dir=tmpdirname, in_channels=9, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) # make sure that error message states what keys are missing assert "size mismatch" in str(error_context.exception) @@ -114,7 +116,8 @@ def test_weight_overwrite(self): low_cpu_mem_usage=False, ignore_mismatched_sizes=True, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) assert model.config.in_channels == 9 @@ -139,8 +142,7 @@ def test_from_save_pretrained(self): if isinstance(new_image, dict): new_image = new_image.sample max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-05, - "Models give different forward passes") + self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes") def test_getattr_is_correct(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -183,10 +185,7 @@ def test_getattr_is_correct(self): with self.assertRaises(AttributeError) as error: model.does_not_exist - assert ( - str(error.exception) == - f"'{type(model).__name__}' object has no attribute 'does_not_exist'" - ) + assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'" def test_from_save_pretrained_variant(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -196,8 +195,7 @@ def test_from_save_pretrained_variant(self): model.eval() with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname, variant="fp16") - new_model = self.model_class.from_pretrained( - tmpdirname, variant="fp16") + new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16") if hasattr(new_model, "set_default_attn_processor"): new_model.set_default_attn_processor() # non-variant cannot be loaded @@ -208,8 +206,7 @@ def test_from_save_pretrained_variant(self): # support diffusion_pytorch_model.bin and model_state.pdparams assert "Error no file named model_state.pdparams found in directory" in str( error_context.exception - ) or "Error no file named diffusion_pytorch_model.bin found in directory" in str( - error_context.exception) + ) or "Error no file 
named diffusion_pytorch_model.bin found in directory" in str(error_context.exception) with paddle.no_grad(): image = model(**inputs_dict) @@ -219,8 +216,7 @@ def test_from_save_pretrained_variant(self): if isinstance(new_image, dict): new_image = new_image.sample max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-05, - "Models give different forward passes") + self.assertLessEqual(max_diff, 5e-05, "Models give different forward passes") def test_from_save_pretrained_dtype(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -231,11 +227,9 @@ def test_from_save_pretrained_dtype(self): with tempfile.TemporaryDirectory() as tmpdirname: model.to(dtype=dtype) model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained( - tmpdirname, paddle_dtype=dtype) + new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype) assert new_model.dtype == dtype - new_model = self.model_class.from_pretrained( - tmpdirname, paddle_dtype=dtype) + new_model = self.model_class.from_pretrained(tmpdirname, paddle_dtype=dtype) assert new_model.dtype == dtype def test_determinism(self): @@ -266,8 +260,7 @@ def test_output(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_forward_with_norm_groups(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -281,8 +274,7 @@ def test_forward_with_norm_groups(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_forward_signature(self): init_dict, _ = self.prepare_init_args_and_inputs_for_common() @@ -320,8 +312,7 @@ def test_training(self): output = model(**inputs_dict) if isinstance(output, dict): output = output.sample - noise = paddle.randn( - shape=list((inputs_dict["sample"].shape[0], ) + self.output_shape)) + noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape)) loss = paddle.nn.functional.mse_loss(input=output, label=noise) loss.backward() @@ -333,8 +324,7 @@ def test_ema_training(self): output = model(**inputs_dict) if isinstance(output, dict): output = output.sample - noise = paddle.randn( - shape=list((inputs_dict["sample"].shape[0], ) + self.output_shape)) + noise = paddle.randn(shape=list((inputs_dict["sample"].shape[0],) + self.output_shape)) loss = paddle.nn.functional.mse_loss(input=output, label=noise) loss.backward() ema_model.step(model.parameters()) @@ -346,12 +336,10 @@ def set_nan_tensor_to_zero(t): def recursive_check(tuple_object, dict_object): if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object, dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) 
elif tuple_object is None: return @@ -360,7 +348,8 @@ def recursive_check(tuple_object, dict_object): paddle.allclose( set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), - atol=1e-05, ), + atol=1e-05, + ), msg=f"Tuple and dict output are not equal. Difference: {paddle.max(x=paddle.abs(x=tuple_object - dict_object))}. Tuple has `nan`: {paddle.isnan(x=tuple_object).any()} and `inf`: {paddle.isinf(x=tuple_object)}. Dict has `nan`: {paddle.isnan(x=dict_object).any()} and `inf`: {paddle.isinf(x=dict_object)}.", ) @@ -384,8 +373,7 @@ def test_enable_disable_gradient_checkpointing(self): self.assertFalse(model.is_gradient_checkpointing) def test_deprecated_kwargs(self): - has_kwarg_in_model_class = ( - "kwargs" in inspect.signature(self.model_class.__init__).parameters) + has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0 if has_kwarg_in_model_class and not has_deprecated_kwarg: raise ValueError( diff --git a/ppdiffusers/tests/models/test_models_unet_1d.py b/ppdiffusers/tests/models/test_models_unet_1d.py index 8ff48ee303f86..8d1339ed5c4dc 100644 --- a/ppdiffusers/tests/models/test_models_unet_1d.py +++ b/ppdiffusers/tests/models/test_models_unet_1d.py @@ -79,9 +79,9 @@ def prepare_init_args_and_inputs_for_common(self): "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", - "DownResnetBlock1D", ), - "up_block_types": - ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"), + "DownResnetBlock1D", + ), + "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"), "act_fn": "mish", } inputs_dict = self.dummy_input @@ -91,38 +91,37 @@ def test_from_pretrained_hub(self): model, loading_info = UNet1DModel.from_pretrained( "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, - subfolder="unet", ) + subfolder="unet", + ) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input) assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet") + model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet") paddle.seed(0) num_features = model.config.in_channels seq_len = 16 - noise = paddle.randn(shape=(1, seq_len, num_features)).transpose( - perm=[0, 2, 1]) - time_step = paddle.full(shape=(num_features, ), fill_value=0) + noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1]) + time_step = paddle.full(shape=(num_features,), fill_value=0) with paddle.no_grad(): output = model(noise, time_step).sample.permute(0, 2, 1) output_slice = output[0, -3:, -3:].flatten() - expected_output_slice = paddle.to_tensor([ - -0.2857576608657837, - -0.9908187389373779, - 0.2976357340812683, - -0.8677187561988831, - -0.21778395771980286, - 0.08095654845237732, - -0.5871752500534058, - 0.3299727439880371, - -0.17421625554561615, - ]) - self.assertTrue( - paddle.allclose( - output_slice, expected_output_slice, rtol=0.001)) + expected_output_slice = paddle.to_tensor( + [ + -0.2857576608657837, + -0.9908187389373779, + 0.2976357340812683, + -0.8677187561988831, + -0.21778395771980286, + 0.08095654845237732, + -0.5871752500534058, + 0.3299727439880371, + -0.17421625554561615, + ] + ) + self.assertTrue(paddle.allclose(output_slice, expected_output_slice, rtol=0.001)) def 
test_forward_with_norm_groups(self): pass @@ -133,9 +132,9 @@ def test_unet_1d_maestro(self): model_id = "harmonai/maestro-150k" model = UNet1DModel.from_pretrained(model_id, subfolder="unet") sample_size = 65536 - noise = paddle.sin(x=paddle.arange( - start=sample_size, - dtype=paddle.float32)[None, None, :].tile(repeat_times=[1, 2, 1])) + noise = paddle.sin( + x=paddle.arange(start=sample_size, dtype=paddle.float32)[None, None, :].tile(repeat_times=[1, 2, 1]) + ) timestep = paddle.to_tensor([1.0]) # must cast float32 with paddle.no_grad(): output = model(noise, timestep).sample @@ -187,8 +186,7 @@ def test_output(self): output = output.sample self.assertIsNotNone(output) expected_shape = [inputs_dict["sample"].shape[0], 1] - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_ema_training(self): pass @@ -225,7 +223,8 @@ def test_from_pretrained_hub(self): value_function, vf_loading_info = UNet1DModel.from_pretrained( "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, - subfolder="value_function", ) + subfolder="value_function", + ) self.assertIsNotNone(value_function) self.assertEqual(len(vf_loading_info["missing_keys"]), 0) image = value_function(**self.dummy_input) @@ -235,19 +234,17 @@ def test_output_pretrained(self): value_function, vf_loading_info = UNet1DModel.from_pretrained( "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, - subfolder="value_function", ) + subfolder="value_function", + ) paddle.seed(0) num_features = value_function.config.in_channels seq_len = 14 - noise = paddle.randn(shape=(1, seq_len, num_features)).transpose( - perm=[0, 2, 1]) - time_step = paddle.full(shape=(num_features, ), fill_value=0) + noise = paddle.randn(shape=(1, seq_len, num_features)).transpose(perm=[0, 2, 1]) + time_step = paddle.full(shape=(num_features,), fill_value=0) with paddle.no_grad(): output = value_function(noise, time_step).sample expected_output_slice = paddle.to_tensor([291.51135254] * seq_len) - self.assertTrue( - paddle.allclose( - output.squeeze(-1), expected_output_slice, rtol=0.001)) + self.assertTrue(paddle.allclose(output.squeeze(-1), expected_output_slice, rtol=0.001)) def test_forward_with_norm_groups(self): pass diff --git a/ppdiffusers/tests/models/test_models_unet_2d.py b/ppdiffusers/tests/models/test_models_unet_2d.py index 6473ab0323f19..15147e00742e8 100644 --- a/ppdiffusers/tests/models/test_models_unet_2d.py +++ b/ppdiffusers/tests/models/test_models_unet_2d.py @@ -97,22 +97,19 @@ def prepare_init_args_and_inputs_for_common(self): return init_dict, inputs_dict def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True) + model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input).sample assert image is not None, "Make sure output is not None" def test_from_pretrained_accelerate(self): - model, _ = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True) + model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) image = model(**self.dummy_input).sample assert image is not None, "Make sure output is not None" def 
test_from_pretrained_accelerate_wont_change_results(self): - model_accelerate, _ = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True) + model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) model_accelerate model_accelerate.eval() noise = paddle.randn( @@ -122,7 +119,8 @@ def test_from_pretrained_accelerate_wont_change_results(self): model_accelerate.config.sample_size, model_accelerate.config.sample_size, ], - generator=paddle.Generator().manual_seed(0), ) + generator=paddle.Generator().manual_seed(0), + ) time_step = paddle.to_tensor([10] * noise.shape[0]) arr_accelerate = model_accelerate(noise, time_step)["sample"] del model_accelerate @@ -130,7 +128,8 @@ def test_from_pretrained_accelerate_wont_change_results(self): gc.collect() model_normal_load, _ = UNet2DModel.from_pretrained( "fusing/unet-ldm-dummy-update", - output_loading_info=True, ) + output_loading_info=True, + ) model_normal_load.eval() arr_normal_load = model_normal_load(noise, time_step)["sample"] assert paddle_all_close(arr_accelerate, arr_normal_load, rtol=0.001) @@ -145,25 +144,26 @@ def test_output_pretrained(self): model.config.sample_size, model.config.sample_size, ], - generator=paddle.Generator().manual_seed(0), ) + generator=paddle.Generator().manual_seed(0), + ) time_step = paddle.to_tensor([10] * noise.shape[0]) with paddle.no_grad(): output = model(noise, time_step).sample output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - 0.43855608, - -10.29346752, - -9.60953522, - -8.39902020, - -16.29206276, - -13.07511997, - -9.30383205, - -13.69859409, - -10.52999401, - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.001)) + expected_output_slice = paddle.to_tensor( + [ + 0.43855608, + -10.29346752, + -9.60953522, + -8.39902020, + -16.29206276, + -13.07511997, + -9.30383205, + -13.69859409, + -10.52999401, + ] + ) + self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.001)) class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): @@ -213,8 +213,7 @@ def prepare_init_args_and_inputs_for_common(self): @slow def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained( - "google/ncsnpp-celebahq-256", output_loading_info=True) + model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) inputs = self.dummy_input @@ -235,24 +234,23 @@ def test_output_pretrained_ve_mid(self): with paddle.no_grad(): output = model(noise, time_step).sample output_slice = output[0, -3:, -3:, -1].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -4836.2231, - -6487.1387, - -3816.7969, - -7964.9253, - -10966.2842, - -20043.6016, - 8137.0571, - 2340.3499, - 544.6114, - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.01)) + expected_output_slice = paddle.to_tensor( + [ + -4836.2231, + -6487.1387, + -3816.7969, + -7964.9253, + -10966.2842, + -20043.6016, + 8137.0571, + 2340.3499, + 544.6114, + ] + ) + self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) def test_output_pretrained_ve_large(self): - model = UNet2DModel.from_pretrained( - "fusing/ncsnpp-ffhq-ve-dummy-update") + model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") paddle.seed(0) batch_size = 4 num_channels = 3 @@ -262,13 
+260,10 @@ def test_output_pretrained_ve_large(self): with paddle.no_grad(): output = model(noise, time_step).sample output_slice = output[0, -3:, -3:, -1].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -0.0325, -0.09, -0.0869, -0.0332, -0.0725, -0.027, -0.0101, 0.0227, - 0.0256 - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.01)) + expected_output_slice = paddle.to_tensor( + [-0.0325, -0.09, -0.0869, -0.0332, -0.0725, -0.027, -0.0101, 0.0227, 0.0256] + ) + self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) def test_forward_with_norm_groups(self): pass diff --git a/ppdiffusers/tests/models/test_models_unet_2d_condition.py b/ppdiffusers/tests/models/test_models_unet_2d_condition.py index 085837f1fb0dd..6b9930399b0c3 100644 --- a/ppdiffusers/tests/models/test_models_unet_2d_condition.py +++ b/ppdiffusers/tests/models/test_models_unet_2d_condition.py @@ -24,9 +24,17 @@ from ppdiffusers import UNet2DConditionModel from ppdiffusers.models.attention_processor import ( - CustomDiffusionAttnProcessor, LoRAAttnProcessor) -from ppdiffusers.utils import (floats_tensor, load_ppnlp_numpy, logging, - paddle_all_close, require_paddle_gpu, slow) + CustomDiffusionAttnProcessor, + LoRAAttnProcessor, +) +from ppdiffusers.utils import ( + floats_tensor, + load_ppnlp_numpy, + logging, + paddle_all_close, + require_paddle_gpu, + slow, +) from ppdiffusers.utils.import_utils import is_ppxformers_available from .test_modeling_common import ModelTesterMixin @@ -34,50 +42,41 @@ logger = logging.get_logger(__name__) -def create_lora_layers(model, mock_weights: bool=True): +def create_lora_layers(model, mock_weights: bool = True): lora_attn_procs = {} for name in model.attn_processors.keys(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - model.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = model.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(model.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = model.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) if mock_weights: with paddle.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight.set_value( - lora_attn_procs[name].to_q_lora.up.weight + 1) - lora_attn_procs[name].to_k_lora.up.weight.set_value( - lora_attn_procs[name].to_k_lora.up.weight + 1) - lora_attn_procs[name].to_v_lora.up.weight.set_value( - lora_attn_procs[name].to_v_lora.up.weight + 1) - lora_attn_procs[name].to_out_lora.up.weight.set_value( - lora_attn_procs[name].to_out_lora.up.weight + 1) + lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1) + lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1) + lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1) + lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1) return lora_attn_procs -def 
create_custom_ppdiffusion_layers(model, mock_weights: bool=True): +def create_custom_ppdiffusion_layers(model, mock_weights: bool = True): train_kv = True train_q_out = True custom_diffusion_attn_procs = {} st = model.state_dict() for name, _ in model.attn_processors.items(): - cross_attention_dim = (None if name.endswith("attn1.processor") else - model.config.cross_attention_dim) + cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim if name.startswith("mid_block"): hidden_size = model.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(model.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = model.config.block_out_channels[block_id] @@ -87,36 +86,33 @@ def create_custom_ppdiffusion_layers(model, mock_weights: bool=True): "to_v_custom_diffusion.weight": st[layer_name + ".to_v.weight"], } if train_q_out: - weights["to_q_custom_diffusion.weight"] = st[layer_name + - ".to_q.weight"] - weights["to_out_custom_diffusion.0.weight"] = st[layer_name + - ".to_out.0.weight"] - weights["to_out_custom_diffusion.0.bias"] = st[layer_name + - ".to_out.0.bias"] + weights["to_q_custom_diffusion.weight"] = st[layer_name + ".to_q.weight"] + weights["to_out_custom_diffusion.0.weight"] = st[layer_name + ".to_out.0.weight"] + weights["to_out_custom_diffusion.0.bias"] = st[layer_name + ".to_out.0.bias"] if cross_attention_dim is not None: custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( train_kv=train_kv, train_q_out=train_q_out, hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, ) + cross_attention_dim=cross_attention_dim, + ) custom_diffusion_attn_procs[name].load_dict(weights) if mock_weights: # add 1 to weights to mock trained weights with paddle.no_grad(): - custom_diffusion_attn_procs[ - name].to_k_custom_diffusion.weight.set_value( - custom_diffusion_attn_procs[ - name].to_k_custom_diffusion.weight + 1) - custom_diffusion_attn_procs[ - name].to_v_custom_diffusion.weight.set_value( - custom_diffusion_attn_procs[ - name].to_v_custom_diffusion.weight + 1) + custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight.set_value( + custom_diffusion_attn_procs[name].to_k_custom_diffusion.weight + 1 + ) + custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight.set_value( + custom_diffusion_attn_procs[name].to_v_custom_diffusion.weight + 1 + ) else: custom_diffusion_attn_procs[name] = CustomDiffusionAttnProcessor( train_kv=False, train_q_out=False, hidden_size=hidden_size, - cross_attention_dim=cross_attention_dim, ) + cross_attention_dim=cross_attention_dim, + ) del st return custom_diffusion_attn_procs @@ -165,9 +161,10 @@ def test_xformers_enable_works(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**init_dict) model.enable_xformers_memory_efficient_attention() - assert (model.mid_block.attentions[0].transformer_blocks[0] - .attn1.processor.__class__.__name__ == "XFormersAttnProcessor" - ), "xformers is not enabled" + assert ( + model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ + == "XFormersAttnProcessor" + ), "xformers is not enabled" def test_gradient_checkpointing(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -190,9 +187,7 @@ def 
test_gradient_checkpointing(self): named_params = dict(model.named_parameters()) named_params_2 = dict(model_2.named_parameters()) for name, param in named_params.items(): - self.assertTrue( - paddle_all_close( - param.grad, named_params_2[name].grad, atol=5e-05)) + self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-05)) def test_model_with_attention_head_dim_tuple(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -205,8 +200,7 @@ def test_model_with_attention_head_dim_tuple(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_use_linear_projection(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -219,8 +213,7 @@ def test_model_with_use_linear_projection(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_cross_attention_dim_tuple(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -238,8 +231,7 @@ def test_model_with_cross_attention_dim_tuple(self): self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_simple_projection(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -262,8 +254,7 @@ def test_model_with_simple_projection(self): self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_with_class_embeddings_concat(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -287,8 +278,7 @@ def test_model_with_class_embeddings_concat(self): self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_model_attention_slicing(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -327,34 +317,32 @@ class AttnEasyProc(nn.Layer): def __init__(self, num): super().__init__() self.weight = self.create_parameter( - (1, ), + (1,), dtype=paddle.get_default_dtype(), - default_initializer=nn.initializer.Constant(num), ) + default_initializer=nn.initializer.Constant(num), + ) self.is_run = False self.number = 0 self.counter = 0 def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - number=None, ): + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + number=None, + ): batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask( - attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) 
query = attn.to_q(hidden_states) - encoder_hidden_states = (encoder_hidden_states - if encoder_hidden_states is not None - else hidden_states) + encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) query = attn.head_to_batch_dim(query) key = attn.head_to_batch_dim(key) value = attn.head_to_batch_dim(value) - attention_probs = attn.get_attention_scores(query, key, - attention_mask) + attention_probs = attn.get_attention_scores(query, key, attention_mask) hidden_states = paddle.matmul(attention_probs, value) hidden_states = attn.batch_to_head_dim(hidden_states) hidden_states = attn.to_out[0](hidden_states) @@ -385,12 +373,9 @@ def test_lora_processors(self): model.set_attn_processor(lora_attn_procs) model.set_attn_processor(model.attn_processors) with paddle.no_grad(): - sample2 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample1 - sample2).abs().max() < 0.0001 assert (sample3 - sample4).abs().max() < 0.0001 assert (sample2 - sample3).abs().max() > 0.0001 @@ -405,20 +390,16 @@ def test_lora_save_load(self): lora_attn_procs = create_lora_layers(model) model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs(tmpdirname, from_diffusers=False) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 1e-4 @@ -441,23 +422,16 @@ def test_lora_save_load_safetensors(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs( - tmpdirname, safe_serialization=True, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "pytorch_lora_weights.safetensors"))) + model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) paddle.seed(0) new_model = self.model_class(**init_dict) - new_model.load_attn_procs( - tmpdirname, from_diffusers=True, use_safetensors=True) + new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + 
new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 0.0001 assert (sample - old_sample).abs().max() > 0.0001 @@ -475,16 +449,15 @@ def test_lora_save_safetensors_load_torch(self): # Saving as torch, properly reloads with directly filename with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs( tmpdirname, weight_name="pytorch_lora_weights.bin", from_diffusers=True, - use_safetensors=False, ) + use_safetensors=False, + ) def test_lora_save_torch_force_load_safetensors_error(self): pass @@ -499,8 +472,7 @@ def test_lora_on_off(self): lora_attn_procs = create_lora_layers(model) model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample model.set_default_attn_processor() with paddle.no_grad(): new_sample = model(**inputs_dict).sample @@ -538,8 +510,7 @@ def test_custom_diffusion_processors(self): with paddle.no_grad(): sample1 = model(**inputs_dict).sample - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers( - model, mock_weights=False) + custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) # make sure we can set a list of attention processors model.set_attn_processor(custom_diffusion_attn_procs) @@ -564,8 +535,7 @@ def test_custom_diffusion_save_load(self): with paddle.no_grad(): old_sample = model(**inputs_dict).sample - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers( - model, mock_weights=False) + custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) model.set_attn_processor(custom_diffusion_attn_procs) with paddle.no_grad(): @@ -573,16 +543,14 @@ def test_custom_diffusion_save_load(self): with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=False) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "paddle_custom_diffusion_weights.pdparams"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_custom_diffusion_weights.pdparams"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs( tmpdirname, weight_name="paddle_custom_diffusion_weights.pdparams", - from_diffusers=False, ) + from_diffusers=False, + ) with paddle.no_grad(): new_sample = new_model(**inputs_dict).sample @@ -604,8 +572,7 @@ def test_custom_diffusion_xformers_on_off(self): paddle.seed(0) model = self.model_class(**init_dict) - custom_diffusion_attn_procs = create_custom_ppdiffusion_layers( - model, mock_weights=False) + custom_diffusion_attn_procs = create_custom_ppdiffusion_layers(model, mock_weights=False) model.set_attn_processor(custom_diffusion_attn_procs) # default @@ -634,20 +601,15 @@ def tearDown(self): def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): dtype = paddle.float16 if fp16 else paddle.float32 - image = paddle.to_tensor(data=load_ppnlp_numpy( - self.get_file_format(seed, shape))).cast(dtype) + image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) return image - def get_unet_model(self, - fp16=False, - 
model_id="CompVis/stable-diffusion-v1-4"): + def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): revision = "fp16" if fp16 else None paddle_dtype = paddle.float16 if fp16 else paddle.float32 model = UNet2DConditionModel.from_pretrained( - model_id, - subfolder="unet", - paddle_dtype=paddle_dtype, - revision=revision) + model_id, subfolder="unet", paddle_dtype=paddle_dtype, revision=revision + ) model.eval() return model @@ -659,10 +621,7 @@ def test_set_attention_slice_auto(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 @@ -674,10 +633,7 @@ def test_set_attention_slice_max(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 @@ -689,10 +645,7 @@ def test_set_attention_slice_int(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 @@ -705,49 +658,35 @@ def test_set_attention_slice_list(self): encoder_hidden_states = self.get_encoder_hidden_states(33) timestep = 1 with paddle.no_grad(): - _ = unet( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample mem_bytes = paddle.device.cuda.memory_allocated() assert mem_bytes < 5 * 10**9 def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False): dtype = "float16" if fp16 else "float32" - hidden_states = paddle.to_tensor(data=load_ppnlp_numpy( - self.get_file_format(seed, shape))).cast(dtype) + hidden_states = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) return hidden_states - @parameterized.expand([ - [ - 33, 4, - [-0.4424, 0.151, -0.1937, 0.2118, 0.3746, -0.3957, 0.016, -0.0435] - ], + @parameterized.expand( [ - 47, - 0.55, + [33, 4, [-0.4424, 0.151, -0.1937, 0.2118, 0.3746, -0.3957, 0.016, -0.0435]], [ - -0.1508, 0.0379, -0.3075, 0.254, 0.3633, -0.0821, 0.1719, - -0.0207 + 47, + 0.55, + [-0.1508, 0.0379, -0.3075, 0.254, 0.3633, -0.0821, 0.1719, -0.0207], ], - ], - [ - 21, - 0.89, [ - -0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, - 0.1778 + 21, + 0.89, + [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778], ], - ], - [ - 9, - 1000, [ - 0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, - -0.4424 + 9, + 1000, + [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_v1_4(self, seed, timestep, expected_slice): model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4") @@ -755,93 +694,69 @@ def test_compvis_sd_v1_4(self, seed, timestep, expected_slice): encoder_hidden_states = 
self.get_encoder_hidden_states(seed) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 83, - 4, [ - -0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, - -0.5806 + 83, + 4, + [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806], ], - ], - [ - 17, - 0.55, [ - -0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, - 0.0701 + 17, + 0.55, + [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701], ], - ], - [ - 8, - 0.89, [ - -0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, - 0.4639 + 8, + 0.89, + [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639], ], - ], - [ - 3, - 1000, [ - -0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, - -1.0078 + 3, + 1000, + [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="CompVis/stable-diffusion-v1-4", fp16=True) + model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True) latents = self.get_latents(seed, fp16=True) encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([ - [ - 33, 4, - [-0.443, 0.157, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722] - ], + @parameterized.expand( [ - 47, - 0.55, + [33, 4, [-0.443, 0.157, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]], [ - -0.1415, 0.0129, -0.3136, 0.2257, 0.343, -0.0536, 0.2114, - -0.0436 + 47, + 0.55, + [-0.1415, 0.0129, -0.3136, 0.2257, 0.343, -0.0536, 0.2114, -0.0436], ], - ], - [ - 21, - 0.89, - [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.175], - ], - [ - 9, - 1000, [ - 0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, - -0.4423 + 21, + 0.89, + [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.175], ], - ], - ]) + [ + 9, + 1000, + [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423], + ], + ] + ) @require_paddle_gpu def test_compvis_sd_v1_5(self, seed, timestep, expected_slice): model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5") @@ -849,199 +764,151 @@ def test_compvis_sd_v1_5(self, seed, timestep, expected_slice): encoder_hidden_states = self.get_encoder_hidden_states(seed) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, 
encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 83, - 4, [ - -0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, - -0.5972 + 83, + 4, + [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972], ], - ], - [ - 17, - 0.55, [ - -0.129, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, - 0.0322 + 17, + 0.55, + [-0.129, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322], + ], + [ + 8, + 0.89, + [-0.5283, 0.1198, 0.087, -0.1141, 0.9189, -0.015, 0.5474, 0.4319], ], - ], - [ - 8, - 0.89, - [-0.5283, 0.1198, 0.087, -0.1141, 0.9189, -0.015, 0.5474, 0.4319], - ], - [ - 3, - 1000, [ - -0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.028, - -1.002 + 3, + 1000, + [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.028, -1.002], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="runwayml/stable-diffusion-v1-5", fp16=True) + model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True) latents = self.get_latents(seed, fp16=True) encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([ + @parameterized.expand( [ - 33, - 4, [ - -0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, - -0.4858 + 33, + 4, + [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858], ], - ], - [ - 47, - 0.55, [ - -0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, - 0.9073 + 47, + 0.55, + [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073], + ], + [ + 21, + 0.89, + [0.0327, 0.4399, -0.6358, 0.3417, 0.412, -0.5621, -0.0397, -1.043], ], - ], - [ - 21, - 0.89, - [0.0327, 0.4399, -0.6358, 0.3417, 0.412, -0.5621, -0.0397, -1.043], - ], - [ - 9, - 1000, [ - 0.16, 0.7303, -1.0556, -0.3515, -0.744, -1.2037, -1.8149, - -1.8931 + 9, + 1000, + [0.16, 0.7303, -1.0556, -0.3515, -0.744, -1.2037, -1.8149, -1.8931], ], - ], - ]) + ] + ) @require_paddle_gpu def test_compvis_sd_inpaint(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="runwayml/stable-diffusion-inpainting") + model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting") latents = self.get_latents(seed, shape=(4, 9, 64, 64)) encoder_hidden_states = self.get_encoder_hidden_states(seed) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == [4, 4, 64, 64] output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = 
paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 83, - 4, [ - -0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, - 1.1387 + 83, + 4, + [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387], ], - ], - [ - 17, - 0.55, - [0.0975, -0.2856, -0.3508, -0.46, 0.3376, 0.293, -0.2747, -0.7026], - ], - [ - 8, - 0.89, [ - -0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, - -0.3486 + 17, + 0.55, + [0.0975, -0.2856, -0.3508, -0.46, 0.3376, 0.293, -0.2747, -0.7026], ], - ], - [ - 3, - 1000, [ - 0.479, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, - -0.9741 + 8, + 0.89, + [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486], ], - ], - ]) + [ + 3, + 1000, + [0.479, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741], + ], + ] + ) @require_paddle_gpu def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="runwayml/stable-diffusion-inpainting", fp16=True) + model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True) latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True) encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == [4, 4, 64, 64] output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([ - [ - 83, 4, - [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.231] - ], - [ - 17, - 0.55, - [0.1164, -0.0216, 0.017, 0.1589, -0.312, 0.1005, -0.0581, -0.1458], - ], + @parameterized.expand( [ - 8, - 0.89, + [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.231]], [ - -0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, - 0.2139 + 17, + 0.55, + [0.1164, -0.0216, 0.017, 0.1589, -0.312, 0.1005, -0.0581, -0.1458], + ], + [ + 8, + 0.89, + [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139], ], - ], - [ - 3, - 1000, [ - 0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.234, - -0.0539 + 3, + 1000, + [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.234, -0.0539], ], - ], - ]) + ] + ) @require_paddle_gpu def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model( - model_id="stabilityai/stable-diffusion-2", fp16=True) + model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True) latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states( - seed, shape=(4, 77, 1024), fp16=True) + encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) timestep = paddle.to_tensor([timestep], dtype="int64") with paddle.no_grad(): - sample = model( - latents, - timestep=timestep, - encoder_hidden_states=encoder_hidden_states).sample + sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample assert sample.shape == latents.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = 
paddle.to_tensor(expected_slice) diff --git a/ppdiffusers/tests/models/test_models_unet_3d_condition.py b/ppdiffusers/tests/models/test_models_unet_3d_condition.py index 12479b35ac6f0..ca2f44b1edd9f 100644 --- a/ppdiffusers/tests/models/test_models_unet_3d_condition.py +++ b/ppdiffusers/tests/models/test_models_unet_3d_condition.py @@ -20,8 +20,7 @@ import paddle from ppdiffusers.models import UNet3DConditionModel -from ppdiffusers.models.attention_processor import (AttnProcessor, - LoRAAttnProcessor) +from ppdiffusers.models.attention_processor import AttnProcessor, LoRAAttnProcessor from ppdiffusers.utils import floats_tensor, logging from ppdiffusers.utils.import_utils import is_ppxformers_available @@ -30,20 +29,18 @@ logger = logging.get_logger(__name__) -def create_lora_layers(model, mock_weights: bool=True): +def create_lora_layers(model, mock_weights: bool = True): lora_attn_procs = {} for name in model.attn_processors.keys(): has_cross_attention = name.endswith("attn2.processor") and not ( - name.startswith("transformer_in") or - "temp_attentions" in name.split(".")) - cross_attention_dim = (model.config.cross_attention_dim - if has_cross_attention else None) + name.startswith("transformer_in") or "temp_attentions" in name.split(".") + ) + cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None if name.startswith("mid_block"): hidden_size = model.config.block_out_channels[-1] elif name.startswith("up_blocks"): block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[ - block_id] + hidden_size = list(reversed(model.config.block_out_channels))[block_id] elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = model.config.block_out_channels[block_id] @@ -51,20 +48,15 @@ def create_lora_layers(model, mock_weights: bool=True): # Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148 hidden_size = 8 * model.config.attention_head_dim - lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) if mock_weights: # add 1 to weights to mock trained weights with paddle.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight.set_value( - lora_attn_procs[name].to_q_lora.up.weight + 1) - lora_attn_procs[name].to_k_lora.up.weight.set_value( - lora_attn_procs[name].to_k_lora.up.weight + 1) - lora_attn_procs[name].to_v_lora.up.weight.set_value( - lora_attn_procs[name].to_v_lora.up.weight + 1) - lora_attn_procs[name].to_out_lora.up.weight.set_value( - lora_attn_procs[name].to_out_lora.up.weight + 1) + lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1) + lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1) + lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1) + lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1) return lora_attn_procs @@ -99,7 +91,8 @@ def prepare_init_args_and_inputs_for_common(self): "block_out_channels": (32, 64), "down_block_types": ( "CrossAttnDownBlock3D", - "DownBlock3D", ), + "DownBlock3D", + ), "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"), "cross_attention_dim": 32, "attention_head_dim": 8, @@ 
-121,9 +114,10 @@ def test_xformers_enable_works(self): model.enable_xformers_memory_efficient_attention() - assert (model.mid_block.attentions[0].transformer_blocks[0] - .attn1.processor.__class__.__name__ == "XFormersAttnProcessor" - ), "xformers is not enabled" + assert ( + model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ + == "XFormersAttnProcessor" + ), "xformers is not enabled" # Overriding to set `norm_num_groups` needs to be different for this model. def test_forward_with_norm_groups(self): @@ -140,8 +134,7 @@ def test_forward_with_norm_groups(self): output = output.sample self.assertIsNotNone(output) expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, - "Input and output shapes do not match") + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") # Overriding since the UNet3D outputs a different structure. def test_determinism(self): @@ -199,12 +192,9 @@ def test_lora_processors(self): model.set_attn_processor(model.attn_processors) with paddle.no_grad(): - sample2 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample1 - sample2).abs().max() < 1e-4 assert (sample3 - sample4).abs().max() < 1e-4 @@ -227,23 +217,20 @@ def test_lora_save_load(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs( tmpdirname, - to_diffusers=False, ) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) + to_diffusers=False, + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs(tmpdirname, from_diffusers=False) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 1e-4 @@ -265,24 +252,17 @@ def test_lora_save_load_safetensors(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs( - tmpdirname, safe_serialization=True, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, - "pytorch_lora_weights.safetensors"))) + model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) paddle.seed(0) new_model = self.model_class(**init_dict) - new_model.load_attn_procs( - tmpdirname, use_safetensors=True, from_diffusers=True) + new_model.load_attn_procs(tmpdirname, use_safetensors=True, 
from_diffusers=True) with paddle.no_grad(): - new_sample = new_model( - **inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample + new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample assert (sample - new_sample).abs().max() < 1e-4 @@ -303,16 +283,15 @@ def test_lora_save_safetensors_load_torch(self): # Saving as paddle, properly reloads with directly filename with tempfile.TemporaryDirectory() as tmpdirname: model.save_attn_procs(tmpdirname, to_diffusers=True) - self.assertTrue( - os.path.isfile( - os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) paddle.seed(0) new_model = self.model_class(**init_dict) new_model.load_attn_procs( tmpdirname, weight_name="pytorch_lora_weights.bin", use_safetensors=False, - from_diffusers=True, ) + from_diffusers=True, + ) def test_lora_save_paddle_force_load_safetensors_error(self): pass @@ -332,8 +311,7 @@ def test_lora_on_off(self): model.set_attn_processor(lora_attn_procs) with paddle.no_grad(): - sample = model( - **inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample + sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample model.set_attn_processor(AttnProcessor()) diff --git a/ppdiffusers/tests/models/test_models_vae.py b/ppdiffusers/tests/models/test_models_vae.py index 8cc3c0794fbd8..c385339e1b134 100644 --- a/ppdiffusers/tests/models/test_models_vae.py +++ b/ppdiffusers/tests/models/test_models_vae.py @@ -20,8 +20,13 @@ from parameterized import parameterized from ppdiffusers import AutoencoderKL -from ppdiffusers.utils import (floats_tensor, load_ppnlp_numpy, - paddle_all_close, require_paddle_gpu, slow) +from ppdiffusers.utils import ( + floats_tensor, + load_ppnlp_numpy, + paddle_all_close, + require_paddle_gpu, + slow, +) from .test_modeling_common import ModelTesterMixin @@ -100,13 +105,10 @@ def test_gradient_checkpointing(self): named_params_2 = dict(model_2.named_parameters()) with paddle.no_grad(): for name, param in named_params.items(): - self.assertTrue( - paddle_all_close( - param.grad, named_params_2[name].grad, atol=5e-5)) + self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-5)) def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained( - "fusing/autoencoder-kl-dummy", output_loading_info=True) + model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input) @@ -124,25 +126,25 @@ def test_output_pretrained(self): model.config.sample_size, model.config.sample_size, ], - generator=paddle.Generator().manual_seed(0), ) + generator=paddle.Generator().manual_seed(0), + ) with paddle.no_grad(): - output = model( - image, sample_posterior=True, generator=generator).sample + output = model(image, sample_posterior=True, generator=generator).sample output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -0.39049336, - 0.34836933, - 0.27105471, - -0.02148458, - 0.00975929, - 0.27822807, - -0.12224892, - -0.02011922, - 0.19761699, - ]) - self.assertTrue( - paddle_all_close( - output_slice, expected_output_slice, rtol=0.01)) + expected_output_slice = paddle.to_tensor( + [ + -0.39049336, + 0.34836933, + 0.27105471, + -0.02148458, + 0.00975929, + 0.27822807, + -0.12224892, + -0.02011922, + 0.19761699, + ] + ) + 
self.assertTrue(paddle_all_close(output_slice, expected_output_slice, rtol=0.01)) @slow @@ -157,115 +159,77 @@ def tearDown(self): def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): dtype = paddle.float16 if fp16 else paddle.float32 - image = paddle.to_tensor(data=load_ppnlp_numpy( - self.get_file_format(seed, shape))).cast(dtype) + image = paddle.to_tensor(data=load_ppnlp_numpy(self.get_file_format(seed, shape))).cast(dtype) return image - def get_sd_vae_model(self, - model_id="CompVis/stable-diffusion-v1-4", - fp16=False): + def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): revision = "fp16" if fp16 else None paddle_dtype = paddle.float16 if fp16 else paddle.float32 - model = AutoencoderKL.from_pretrained( - model_id, - subfolder="vae", - paddle_dtype=paddle_dtype, - revision=revision) + model = AutoencoderKL.from_pretrained(model_id, subfolder="vae", paddle_dtype=paddle_dtype, revision=revision) model.eval() return model def get_generator(self, seed=0): return paddle.Generator().manual_seed(seed) - @parameterized.expand([ - [ - 33, - [ - -0.1603, 0.9878, -0.0495, -0.079, -0.2709, 0.8375, -0.206, - -0.0824 - ], - [ - -0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, - -0.1824 - ], - ], + @parameterized.expand( [ - 47, [ - -0.2376, 0.1168, 0.1332, -0.484, -0.2508, -0.0791, -0.0493, - -0.4089 + 33, + [-0.1603, 0.9878, -0.0495, -0.079, -0.2709, 0.8375, -0.206, -0.0824], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824], ], [ - 0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, - -0.1131 + 47, + [-0.2376, 0.1168, 0.1332, -0.484, -0.2508, -0.0791, -0.0493, -0.4089], + [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], ], - ], - ]) + ] + ) def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): model = self.get_sd_vae_model() image = self.get_sd_image(seed) generator = self.get_generator(seed) with paddle.no_grad(): - sample = model( - image, generator=generator, sample_posterior=True).sample + sample = model(image, generator=generator, sample_posterior=True).sample assert sample.shape == image.shape output_slice = sample[-1, -2:, -2:, :2].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ - [ - 33, [ - -0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, - -0.0999 - ] - ], + @parameterized.expand( [ - 47, [ - -0.4128, -0.132, -0.3704, 0.1965, -0.4116, -0.2332, -0.334, - 0.2247 - ] - ], - ]) + [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], + [47, [-0.4128, -0.132, -0.3704, 0.1965, -0.4116, -0.2332, -0.334, 0.2247]], + ] + ) @require_paddle_gpu def test_stable_diffusion_fp16(self, seed, expected_slice): model = self.get_sd_vae_model(fp16=True) image = self.get_sd_image(seed, fp16=True) generator = self.get_generator(seed) with paddle.no_grad(): - sample = model( - image, generator=generator, sample_posterior=True).sample + sample = model(image, generator=generator, sample_posterior=True).sample assert sample.shape == image.shape output_slice = sample[-1, -2:, :2, -2:].flatten().cast("float32").cpu() expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 33, [ - -0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, - -0.0814 + 33, + [-0.1609, 0.9866, -0.0487, -0.0777, 
-0.2716, 0.8368, -0.2055, -0.0814], + [-0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, -0.1824], ], [ - -0.2395, 0.0098, 0.0102, -0.0709, -0.284, -0.0274, -0.0718, - -0.1824 + 47, + [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], + [0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131], ], - ], - [ - 47, - [ - -0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, - -0.4085 - ], - [ - 0.035, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, - -0.1131 - ], - ], - ]) - def test_stable_diffusion_mode(self, seed, expected_slice, - expected_slice_mps): + ] + ) + def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): model = self.get_sd_vae_model() image = self.get_sd_image(seed) with paddle.no_grad(): @@ -275,28 +239,27 @@ def test_stable_diffusion_mode(self, seed, expected_slice, expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 13, [ - -0.2051, - -0.1803, - -0.2311, - -0.2114, - -0.3292, - -0.3574, - -0.2953, - -0.3323, + 13, + [ + -0.2051, + -0.1803, + -0.2311, + -0.2114, + -0.3292, + -0.3574, + -0.2953, + -0.3323, + ], ], - ], - [ - 37, [ - -0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.499, -0.372, - -0.4925 + 37, + [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.499, -0.372, -0.4925], ], - ], - ]) + ] + ) @require_paddle_gpu def test_stable_diffusion_decode(self, seed, expected_slice): model = self.get_sd_vae_model() @@ -308,28 +271,27 @@ def test_stable_diffusion_decode(self, seed, expected_slice): expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.01) - @parameterized.expand([ + @parameterized.expand( [ - 27, [ - -0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.193, -0.1465, - -0.2039 + 27, + [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.193, -0.1465, -0.2039], ], - ], - [ - 16, [ - -0.1628, - -0.2134, - -0.2747, - -0.2642, - -0.3774, - -0.4404, - -0.3687, - -0.4277, + 16, + [ + -0.1628, + -0.2134, + -0.2747, + -0.2642, + -0.3774, + -0.4404, + -0.3687, + -0.4277, + ], ], - ], - ]) + ] + ) @require_paddle_gpu def test_stable_diffusion_decode_fp16(self, seed, expected_slice): model = self.get_sd_vae_model(fp16=True) @@ -341,7 +303,7 @@ def test_stable_diffusion_decode_fp16(self, seed, expected_slice): expected_output_slice = paddle.to_tensor(expected_slice) assert paddle_all_close(output_slice, expected_output_slice, atol=0.005) - @parameterized.expand([(13, ), (16, ), (27, )]) + @parameterized.expand([(13,), (16,), (27,)]) @require_paddle_gpu def test_stable_diffusion_decode_ppxformers_vs_2_5_fp16(self, seed): model = self.get_sd_vae_model(fp16=True) @@ -358,7 +320,7 @@ def test_stable_diffusion_decode_ppxformers_vs_2_5_fp16(self, seed): assert paddle_all_close(sample, sample_2, atol=1e-1) - @parameterized.expand([(13, ), (16, ), (37, )]) + @parameterized.expand([(13,), (16,), (37,)]) @require_paddle_gpu def test_stable_diffusion_decode_ppxformers_vs_2_5(self, seed): model = self.get_sd_vae_model() @@ -375,36 +337,38 @@ def test_stable_diffusion_decode_ppxformers_vs_2_5(self, seed): assert paddle_all_close(sample, sample_2, atol=1e-2) - @parameterized.expand([ + @parameterized.expand( [ - 33, [ - -0.3001, - 0.0918, - -2.6984, - -3.972, - -3.2099, - -5.0353, - 1.7338, - -0.2065, - 3.4267, + 33, + [ + -0.3001, + 0.0918, + -2.6984, + -3.972, + -3.2099, + -5.0353, + 1.7338, + -0.2065, + 3.4267, + ], ], - 
], - [ - 47, [ - -1.503, - -4.3871, - -6.0355, - -9.1157, - -1.6661, - -2.7853, - 2.1607, - -5.0823, - 2.5633, + 47, + [ + -1.503, + -4.3871, + -6.0355, + -9.1157, + -1.6661, + -2.7853, + 2.1607, + -5.0823, + 2.5633, + ], ], - ], - ]) + ] + ) def test_stable_diffusion_encode_sample(self, seed, expected_slice): model = self.get_sd_vae_model() image = self.get_sd_image(seed) @@ -412,11 +376,8 @@ def test_stable_diffusion_encode_sample(self, seed, expected_slice): with paddle.no_grad(): dist = model.encode(image).latent_dist sample = dist.sample(generator=generator) - assert list(sample.shape) == [image.shape[0], 4] + [ - (i // 8) for i in image.shape[2:] - ] + assert list(sample.shape) == [image.shape[0], 4] + [(i // 8) for i in image.shape[2:]] output_slice = sample[0, -1, -3:, -3:].flatten().cpu() expected_output_slice = paddle.to_tensor(expected_slice) tolerance = 0.01 - assert paddle_all_close( - output_slice, expected_output_slice, atol=tolerance) + assert paddle_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/ppdiffusers/tests/models/test_models_vq.py b/ppdiffusers/tests/models/test_models_vq.py index 9b19455a496b6..af2a6292d9353 100644 --- a/ppdiffusers/tests/models/test_models_vq.py +++ b/ppdiffusers/tests/models/test_models_vq.py @@ -60,8 +60,7 @@ def test_training(self): pass def test_from_pretrained_hub(self): - model, loading_info = VQModel.from_pretrained( - "fusing/vqgan-dummy", output_loading_info=True) + model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) image = model(**self.dummy_input) @@ -71,26 +70,28 @@ def test_output_pretrained(self): model = VQModel.from_pretrained("fusing/vqgan-dummy") model.eval() paddle.seed(0) - image = paddle.randn(shape=[ - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - ]) + image = paddle.randn( + shape=[ + 1, + model.config.in_channels, + model.config.sample_size, + model.config.sample_size, + ] + ) with paddle.no_grad(): output = model(image).sample output_slice = output[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = paddle.to_tensor([ - -0.027147896587848663, - -0.41129639744758606, - -0.17730756103992462, - -0.5245445370674133, - -0.2423611730337143, - -0.3957087993621826, - -0.16461530327796936, - -0.06902074813842773, - -0.01736617460846901, - ]) - self.assertTrue( - paddle.allclose( - output_slice, expected_output_slice, atol=0.01)) + expected_output_slice = paddle.to_tensor( + [ + -0.027147896587848663, + -0.41129639744758606, + -0.17730756103992462, + -0.5245445370674133, + -0.2423611730337143, + -0.3957087993621826, + -0.16461530327796936, + -0.06902074813842773, + -0.01736617460846901, + ] + ) + self.assertTrue(paddle.allclose(output_slice, expected_output_slice, atol=0.01)) diff --git a/ppdiffusers/tests/models/test_unet_2d_blocks.py b/ppdiffusers/tests/models/test_unet_2d_blocks.py index df1fdae9f4acf..cfb2100ee38ba 100644 --- a/ppdiffusers/tests/models/test_unet_2d_blocks.py +++ b/ppdiffusers/tests/models/test_unet_2d_blocks.py @@ -16,13 +16,28 @@ import unittest from ppdiffusers.models.unet_2d_blocks import ( - AttnDownBlock2D, AttnDownEncoderBlock2D, AttnSkipDownBlock2D, - AttnSkipUpBlock2D, AttnUpBlock2D, AttnUpDecoderBlock2D, - CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, DownEncoderBlock2D, - ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, SimpleCrossAttnDownBlock2D, - SimpleCrossAttnUpBlock2D, SkipDownBlock2D, 
SkipUpBlock2D, UNetMidBlock2D, - UNetMidBlock2DCrossAttn, UNetMidBlock2DSimpleCrossAttn, UpBlock2D, - UpDecoderBlock2D) + AttnDownBlock2D, + AttnDownEncoderBlock2D, + AttnSkipDownBlock2D, + AttnSkipUpBlock2D, + AttnUpBlock2D, + AttnUpDecoderBlock2D, + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + DownEncoderBlock2D, + ResnetDownsampleBlock2D, + ResnetUpsampleBlock2D, + SimpleCrossAttnDownBlock2D, + SimpleCrossAttnUpBlock2D, + SkipDownBlock2D, + SkipUpBlock2D, + UNetMidBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + UpDecoderBlock2D, +) from .test_unet_blocks_common import UNetBlockTesterMixin @@ -89,8 +104,7 @@ class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): block_type = "down" def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -118,8 +132,7 @@ def dummy_input(self): return super().get_dummy_input(include_encoder_hidden_states=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -269,8 +282,7 @@ class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase): block_type = "mid" def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -289,8 +301,7 @@ def test_output(self): super().test_output(expected_slice) -class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, - unittest.TestCase): +class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase): block_class = UNetMidBlock2DSimpleCrossAttn block_type = "mid" @@ -299,8 +310,7 @@ def dummy_input(self): return super().get_dummy_input(include_encoder_hidden_states=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -374,8 +384,7 @@ def dummy_input(self): return super().get_dummy_input(include_res_hidden_states_tuple=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict @@ -400,13 +409,10 @@ class SimpleCrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase): @property def dummy_input(self): - return super().get_dummy_input( - include_res_hidden_states_tuple=True, - include_encoder_hidden_states=True) + return super().get_dummy_input(include_res_hidden_states_tuple=True, include_encoder_hidden_states=True) def prepare_init_args_and_inputs_for_common(self): - init_dict, inputs_dict = super( - ).prepare_init_args_and_inputs_for_common() + init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common() init_dict["cross_attention_dim"] = 32 return init_dict, inputs_dict diff --git 
a/ppdiffusers/tests/models/test_unet_blocks_common.py b/ppdiffusers/tests/models/test_unet_blocks_common.py index 9f0920c87ef10..4595f43aec64d 100644 --- a/ppdiffusers/tests/models/test_unet_blocks_common.py +++ b/ppdiffusers/tests/models/test_unet_blocks_common.py @@ -35,16 +35,15 @@ def output_shape(self): return 4, 32, 32, 32 elif self.block_type == "up": return 4, 32, 64, 64 - raise ValueError( - f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'." - ) + raise ValueError(f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'.") def get_dummy_input( - self, - include_temb=True, - include_res_hidden_states_tuple=False, - include_encoder_hidden_states=False, - include_skip_sample=False, ): + self, + include_temb=True, + include_res_hidden_states_tuple=False, + include_encoder_hidden_states=False, + include_skip_sample=False, + ): batch_size = 4 num_channels = 32 sizes = 32, 32 @@ -54,28 +53,20 @@ def get_dummy_input( dummy_input = {"hidden_states": hidden_states} if include_temb: temb_channels = 128 - dummy_input["temb"] = randn_tensor( - (batch_size, temb_channels), generator=generator) + dummy_input["temb"] = randn_tensor((batch_size, temb_channels), generator=generator) if include_res_hidden_states_tuple: generator_1 = paddle.Generator().manual_seed(1) - dummy_input["res_hidden_states_tuple"] = (randn_tensor( - shape, generator=generator_1), ) + dummy_input["res_hidden_states_tuple"] = (randn_tensor(shape, generator=generator_1),) if include_encoder_hidden_states: - dummy_input["encoder_hidden_states"] = floats_tensor( - (batch_size, 32, 32)) + dummy_input["encoder_hidden_states"] = floats_tensor((batch_size, 32, 32)) if include_skip_sample: - dummy_input["skip_sample"] = randn_tensor( - (batch_size, 3) + sizes, generator=generator) + dummy_input["skip_sample"] = randn_tensor((batch_size, 3) + sizes, generator=generator) paddle.seed(0) return dummy_input def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 32, - "out_channels": 32, - "temb_channels": 128 - } + init_dict = {"in_channels": 32, "out_channels": 32, "temb_channels": 128} if self.block_type == "up": init_dict["prev_output_channel"] = 32 if self.block_type == "mid": @@ -94,8 +85,7 @@ def test_output(self, expected_slice): self.assertEqual(list(output.shape), list(self.output_shape)) output_slice = output[0, -1, -3:, -3:] expected_slice = paddle.to_tensor(expected_slice) - assert paddle_all_close( - output_slice.flatten(), expected_slice, atol=0.005) + assert paddle_all_close(output_slice.flatten(), expected_slice, atol=0.005) def test_training(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff --git a/ppdiffusers/tests/others/test_config.py b/ppdiffusers/tests/others/test_config.py index 171d2ea28e771..e4637ce2c35a3 100644 --- a/ppdiffusers/tests/others/test_config.py +++ b/ppdiffusers/tests/others/test_config.py @@ -16,10 +16,15 @@ import tempfile import unittest -from ppdiffusers import (DDIMScheduler, DDPMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, PNDMScheduler, logging) +from ppdiffusers import ( + DDIMScheduler, + DDPMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + PNDMScheduler, + logging, +) from ppdiffusers.configuration_utils import ConfigMixin, register_to_config from ppdiffusers.utils.testing_utils import CaptureLogger @@ -44,13 +49,7 @@ class SampleObject3(ConfigMixin): 
config_name = "config.json" @register_to_config - def __init__(self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3]): + def __init__(self, a=2, b=5, c=(2, 5), d="for diffusion", e=[1, 3], f=[1, 3]): pass @@ -99,8 +98,7 @@ def test_save_load(self): assert config["e"] == [1, 3] with tempfile.TemporaryDirectory() as tmpdirname: obj.save_config(tmpdirname) - new_obj = SampleObject.from_config( - SampleObject.load_config(tmpdirname)) + new_obj = SampleObject.from_config(SampleObject.load_config(tmpdirname)) new_config = new_obj.config config = dict(config) new_config = dict(new_config) @@ -114,8 +112,8 @@ def test_load_ddim_from_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: ddim = DDIMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert ddim.__class__ == DDIMScheduler assert cap_logger.out == "" @@ -125,8 +123,8 @@ def test_load_euler_from_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert euler.__class__ == EulerDiscreteScheduler assert cap_logger.out == "" @@ -136,8 +134,8 @@ def test_load_euler_ancestral_from_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: euler = EulerAncestralDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert euler.__class__ == EulerAncestralDiscreteScheduler assert cap_logger.out == "" @@ -147,8 +145,8 @@ def test_load_pndm(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: pndm = PNDMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert pndm.__class__ == PNDMScheduler assert cap_logger.out == "" @@ -161,10 +159,10 @@ def test_overwrite_config_on_load(self): "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler", prediction_type="sample", - beta_end=8, ) + beta_end=8, + ) with CaptureLogger(logger) as cap_logger_2: - ddpm_2 = DDPMScheduler.from_pretrained( - "google/ddpm-celebahq-256", beta_start=88) + ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88) assert ddpm.__class__ == DDPMScheduler assert ddpm.config.prediction_type == "sample" assert ddpm.config.beta_end == 8 @@ -178,7 +176,7 @@ def test_load_dpmsolver(self): logger.setLevel(30) with CaptureLogger(logger) as cap_logger: dpm = DPMSolverMultistepScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" + ) assert dpm.__class__ == DPMSolverMultistepScheduler assert cap_logger.out == "" diff --git a/ppdiffusers/tests/others/test_ema.py b/ppdiffusers/tests/others/test_ema.py index 1ed2044e555e2..e8bd66abcfbee 100644 --- a/ppdiffusers/tests/others/test_ema.py +++ b/ppdiffusers/tests/others/test_ema.py @@ -33,13 +33,13 @@ class EMAModelTests(unittest.TestCase): generator = paddle.Generator().manual_seed(0) def get_models(self, decay=0.9999): - unet = UNet2DConditionModel.from_pretrained( - self.model_id, 
subfolder="unet") + unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet") ema_unet = EMAModel( unet.parameters(), decay=decay, model_cls=UNet2DConditionModel, - model_config=unet.config, ) + model_config=unet.config, + ) return unet, ema_unet def get_dummy_inputs(self): @@ -48,21 +48,23 @@ def get_dummy_inputs(self): self.batch_size, self.num_in_channels, self.latent_height, - self.latent_width, ), - generator=self.generator, ) - timesteps = paddle.randint( - 0, 1000, shape=(self.batch_size, ), generator=self.generator) + self.latent_width, + ), + generator=self.generator, + ) + timesteps = paddle.randint(0, 1000, shape=(self.batch_size,), generator=self.generator) encoder_hidden_states = paddle.randn( (self.batch_size, self.prompt_length, self.text_encoder_hidden_dim), - generator=self.generator, ) + generator=self.generator, + ) return noisy_latents, timesteps, encoder_hidden_states def simulate_backprop(self, unet): updated_state_dict = {} for k, param in unet.state_dict().items(): - updated_param = paddle.randn( - param.shape, dtype=param.dtype) + (param * paddle.randn( - param.shape, dtype=param.dtype)) + updated_param = paddle.randn(param.shape, dtype=param.dtype) + ( + param * paddle.randn(param.shape, dtype=param.dtype) + ) updated_state_dict.update({k: updated_param}) unet.load_dict(updated_state_dict) return unet @@ -131,8 +133,7 @@ def test_consecutive_shadow_params_updated(self): ema_unet.step(unet_step_two.parameters()) step_two_shadow_params = ema_unet.shadow_params - for step_one, step_two in zip(step_one_shadow_params, - step_two_shadow_params): + for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params): assert not paddle.allclose(step_one, step_two) def test_zero_decay(self): @@ -148,23 +149,19 @@ def test_zero_decay(self): ema_unet.step(unet_step_two.parameters()) step_two_shadow_params = ema_unet.shadow_params - for step_one, step_two in zip(step_one_shadow_params, - step_two_shadow_params): + for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params): assert paddle.allclose(step_one, step_two) def test_serialization(self): unet, ema_unet = self.get_models() - noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs( - ) + noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs() with tempfile.TemporaryDirectory() as tmpdir: ema_unet.save_pretrained(tmpdir) - loaded_unet = UNet2DConditionModel.from_pretrained( - tmpdir, model_cls=UNet2DConditionModel) + loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel) # Since no EMA step has been performed the outputs should match. 
output = unet(noisy_latents, timesteps, encoder_hidden_states).sample - output_loaded = loaded_unet(noisy_latents, timesteps, - encoder_hidden_states).sample + output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample assert paddle.allclose(output, output_loaded, atol=1e-4) diff --git a/ppdiffusers/tests/others/test_image_processor.py b/ppdiffusers/tests/others/test_image_processor.py index 054fe2b955ca9..e0c88c40e56b4 100644 --- a/ppdiffusers/tests/others/test_image_processor.py +++ b/ppdiffusers/tests/others/test_image_processor.py @@ -50,10 +50,10 @@ def test_vae_image_processor_pd(self): for output_type in ["pd", "np", "pil"]: out = image_processor.postprocess( image_processor.preprocess(input_pd), - output_type=output_type, ) + output_type=output_type, + ) out_np = self.to_np(out) - in_np = (input_np * - 255).round() if output_type == "pil" else input_np + in_np = (input_np * 255).round() if output_type == "pil" else input_np assert ( np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" @@ -63,12 +63,10 @@ def test_vae_image_processor_np(self): input_np = self.dummy_sample.transpose([0, 2, 3, 1]).cpu().numpy() for output_type in ["pd", "np", "pil"]: - out = image_processor.postprocess( - image_processor.preprocess(input_np), output_type=output_type) + out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) out_np = self.to_np(out) - in_np = (input_np * - 255).round() if output_type == "pil" else input_np + in_np = (input_np * 255).round() if output_type == "pil" else input_np assert ( np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" @@ -80,12 +78,10 @@ def test_vae_image_processor_pil(self): input_pil = image_processor.numpy_to_pil(input_np) for output_type in ["pd", "np", "pil"]: - out = image_processor.postprocess( - image_processor.preprocess(input_pil), output_type=output_type) + out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) for i, o in zip(input_pil, out): in_np = np.array(i) - out_np = (self.to_np(out) if output_type == "pil" else - (self.to_np(out) * 255).round()) + out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round() assert ( np.abs(in_np - out_np).max() < 1e-6 ), f"decoded output does not match input for output_type {output_type}" @@ -98,20 +94,24 @@ def test_preprocess_input_3d(self): out_pt_4d = image_processor.postprocess( image_processor.preprocess(input_pd_4d), - output_type="np", ) + output_type="np", + ) out_pt_3d = image_processor.postprocess( image_processor.preprocess(input_pd_3d), - output_type="np", ) + output_type="np", + ) input_np_4d = self.to_np(self.dummy_sample) input_np_3d = input_np_4d.squeeze(0) out_np_4d = image_processor.postprocess( image_processor.preprocess(input_np_4d), - output_type="np", ) + output_type="np", + ) out_np_3d = image_processor.postprocess( image_processor.preprocess(input_np_3d), - output_type="np", ) + output_type="np", + ) assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6 assert np.abs(out_np_4d - out_np_3d).max() < 1e-6 @@ -124,22 +124,26 @@ def test_preprocess_input_list(self): out_pt_4d = image_processor.postprocess( image_processor.preprocess(input_pd_4d), - output_type="np", ) + output_type="np", + ) out_pt_list = image_processor.postprocess( image_processor.preprocess(input_pd_list), - output_type="np", ) + output_type="np", + ) input_np_4d = 
self.to_np(self.dummy_sample) list(input_np_4d) out_np_4d = image_processor.postprocess( image_processor.preprocess(input_pd_4d), - output_type="np", ) + output_type="np", + ) out_np_list = image_processor.postprocess( image_processor.preprocess(input_pd_list), - output_type="np", ) + output_type="np", + ) assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6 assert np.abs(out_np_4d - out_np_list).max() < 1e-6 diff --git a/ppdiffusers/tests/others/test_training.py b/ppdiffusers/tests/others/test_training.py index c52c0988951f2..12b72686eaed6 100644 --- a/ppdiffusers/tests/others/test_training.py +++ b/ppdiffusers/tests/others/test_training.py @@ -17,8 +17,7 @@ import paddle -from ppdiffusers import (DDIMScheduler, DDPMScheduler, UNet2DConditionModel, - UNet2DModel) +from ppdiffusers import DDIMScheduler, DDPMScheduler, UNet2DConditionModel, UNet2DModel from ppdiffusers.training_utils import set_seed from ppdiffusers.utils.import_utils import is_ppxformers_available from ppdiffusers.utils.testing_utils import slow @@ -27,10 +26,8 @@ class UNet2DModelTrainingTests(unittest.TestCase): def get_model_optimizer(self, resolution=32): set_seed(0) - model = UNet2DModel( - sample_size=resolution, in_channels=3, out_channels=3) - optimizer = paddle.optimizer.SGD(parameters=model.parameters(), - learning_rate=0.0001) + model = UNet2DModel(sample_size=resolution, in_channels=3, out_channels=3) + optimizer = paddle.optimizer.SGD(parameters=model.parameters(), learning_rate=0.0001) return model, optimizer @slow @@ -40,34 +37,27 @@ def test_training_step_equality(self): beta_start=0.0001, beta_end=0.02, beta_schedule="linear", - clip_sample=True, ) + clip_sample=True, + ) ddim_scheduler = DDIMScheduler( num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02, beta_schedule="linear", - clip_sample=True, ) - assert (ddpm_scheduler.config.num_train_timesteps == - ddim_scheduler.config.num_train_timesteps) + clip_sample=True, + ) + assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps set_seed(0) - clean_images = [ - paddle.randn(shape=(4, 3, 32, 32)).clip( - min=-1, max=1) for _ in range(4) - ] + clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] - timesteps = [ - paddle.randint(0, 1000, (4, )).astype(dtype="int64") - for _ in range(4) - ] + timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] model, optimizer = self.get_model_optimizer(resolution=32) model.train() for i in range(4): optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i]).sample - loss = paddle.nn.functional.mse_loss( - input=ddpm_noise_pred, label=noise[i]) + loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer @@ -75,30 +65,22 @@ def test_training_step_equality(self): model.train() for i in range(4): optimizer.clear_grad() - ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddim_noise_pred = model(ddim_noisy_images, timesteps[i]).sample - loss = paddle.nn.functional.mse_loss( - input=ddim_noise_pred, label=noise[i]) + loss = 
paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer - self.assertTrue( - paddle.allclose( - ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) - self.assertTrue( - paddle.allclose( - ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) + self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) + self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) # new added class UNet2DConditionModelTrainingTests(unittest.TestCase): def get_model_optimizer(self, resolution=32): set_seed(0) - model = UNet2DConditionModel( - sample_size=resolution, in_channels=3, out_channels=3) - optimizer = paddle.optimizer.AdamW( - parameters=model.parameters(), learning_rate=0.0001) + model = UNet2DConditionModel(sample_size=resolution, in_channels=3, out_channels=3) + optimizer = paddle.optimizer.AdamW(parameters=model.parameters(), learning_rate=0.0001) return model, optimizer @slow @@ -107,37 +89,31 @@ def test_training_step_equality(self): num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) ddim_scheduler = DDIMScheduler( num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, - beta_schedule="scaled_linear", ) - assert (ddpm_scheduler.config.num_train_timesteps == - ddim_scheduler.config.num_train_timesteps) + beta_schedule="scaled_linear", + ) + assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps set_seed(0) - clean_images = [ - paddle.randn(shape=(4, 3, 32, 32)).clip( - min=-1, max=1) for _ in range(4) - ] + clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)] - timesteps = [ - paddle.randint(0, 1000, (4, )).astype(dtype="int64") - for _ in range(4) - ] + timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] model, optimizer = self.get_model_optimizer(resolution=32) model.train() for i in range(4): optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddpm_noise_pred = model( ddpm_noisy_images, timesteps[i], - encoder_hidden_states=text_embeddings[i], ).sample - loss = paddle.nn.functional.mse_loss( - input=ddpm_noise_pred, label=noise[i]) + encoder_hidden_states=text_embeddings[i], + ).sample + loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer @@ -145,23 +121,18 @@ def test_training_step_equality(self): model.train() for i in range(4): optimizer.clear_grad() - ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddim_noise_pred = model( ddim_noisy_images, timesteps[i], - encoder_hidden_states=text_embeddings[i], ).sample - loss = paddle.nn.functional.mse_loss( - input=ddim_noise_pred, label=noise[i]) + encoder_hidden_states=text_embeddings[i], + ).sample + loss = paddle.nn.functional.mse_loss(input=ddim_noise_pred, label=noise[i]) loss.backward() optimizer.step() del model, optimizer - self.assertTrue( - paddle.allclose( - ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) - self.assertTrue( - paddle.allclose( - 
ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) + self.assertTrue(paddle.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-05)) + self.assertTrue(paddle.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-04)) @unittest.skipIf( not is_ppxformers_available(), @@ -173,17 +144,12 @@ def test_recompute_xformers_training(self): num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, - beta_schedule="scaled_linear", ) + beta_schedule="scaled_linear", + ) set_seed(0) - clean_images = [ - paddle.randn(shape=(4, 3, 32, 32)).clip( - min=-1, max=1) for _ in range(4) - ] + clean_images = [paddle.randn(shape=(4, 3, 32, 32)).clip(min=-1, max=1) for _ in range(4)] noise = [paddle.randn(shape=(4, 3, 32, 32)) for _ in range(4)] - timesteps = [ - paddle.randint(0, 1000, (4, )).astype(dtype="int64") - for _ in range(4) - ] + timesteps = [paddle.randint(0, 1000, (4,)).astype(dtype="int64") for _ in range(4)] text_embeddings = [paddle.randn(shape=(4, 77, 1280)) for _ in range(4)] model, optimizer = self.get_model_optimizer(resolution=32) model.enable_gradient_checkpointing() @@ -191,13 +157,12 @@ def test_recompute_xformers_training(self): model.train() for i in range(4): optimizer.clear_grad() - ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], - noise[i], timesteps[i]) + ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i]) ddpm_noise_pred = model( ddpm_noisy_images, timesteps[i], - encoder_hidden_states=text_embeddings[i], ).sample - loss = paddle.nn.functional.mse_loss( - input=ddpm_noise_pred, label=noise[i]) + encoder_hidden_states=text_embeddings[i], + ).sample + loss = paddle.nn.functional.mse_loss(input=ddpm_noise_pred, label=noise[i]) loss.backward() optimizer.step() diff --git a/ppdiffusers/tests/others/test_utils.py b/ppdiffusers/tests/others/test_utils.py index 870e791a6f54b..ae27388bf5f60 100644 --- a/ppdiffusers/tests/others/test_utils.py +++ b/ppdiffusers/tests/others/test_utils.py @@ -20,34 +20,27 @@ class DeprecateTester(unittest.TestCase): - higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + - __version__.split(".")[1:]) + higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + __version__.split(".")[1:]) lower_version = "0.0.1" def test_deprecate_function_arg(self): kwargs = {"deprecated_arg": 4} with self.assertWarns(FutureWarning) as warning: - output = deprecate( - "deprecated_arg", - self.higher_version, - "message", - take_from=kwargs) + output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs) assert output == 4 assert ( - str(warning.warning) == - f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" ) def test_deprecate_function_arg_tuple(self): kwargs = {"deprecated_arg": 4} with self.assertWarns(FutureWarning) as warning: - output = deprecate( - ("deprecated_arg", self.higher_version, "message"), - take_from=kwargs) + output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs) assert output == 4 assert ( - str(warning.warning) == - f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}. 
message" ) def test_deprecate_function_args(self): @@ -56,49 +49,48 @@ def test_deprecate_function_args(self): output_1, output_2 = deprecate( ("deprecated_arg_1", self.higher_version, "Hey"), ("deprecated_arg_2", self.higher_version, "Hey"), - take_from=kwargs, ) + take_from=kwargs, + ) assert output_1 == 4 assert output_2 == 8 assert ( - str(warning.warnings[0].message) == - f"The `deprecated_arg_1` argument is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[0].message) + == f"The `deprecated_arg_1` argument is deprecated and will be removed in version {self.higher_version}. Hey" ) assert ( - str(warning.warnings[1].message) == - f"The `deprecated_arg_2` argument is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[1].message) + == f"The `deprecated_arg_2` argument is deprecated and will be removed in version {self.higher_version}. Hey" ) def test_deprecate_function_incorrect_arg(self): kwargs = {"deprecated_arg": 4} with self.assertRaises(TypeError) as error: - deprecate( - ("wrong_arg", self.higher_version, "message"), take_from=kwargs) - assert "test_deprecate_function_incorrect_arg in" in str( - error.exception) + deprecate(("wrong_arg", self.higher_version, "message"), take_from=kwargs) + assert "test_deprecate_function_incorrect_arg in" in str(error.exception) assert "line" in str(error.exception) - assert "got an unexpected keyword argument `deprecated_arg`" in str( - error.exception) + assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception) def test_deprecate_arg_no_kwarg(self): with self.assertWarns(FutureWarning) as warning: deprecate(("deprecated_arg", self.higher_version, "message")) assert ( - str(warning.warning) == - f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message" ) def test_deprecate_args_no_kwarg(self): with self.assertWarns(FutureWarning) as warning: deprecate( ("deprecated_arg_1", self.higher_version, "Hey"), - ("deprecated_arg_2", self.higher_version, "Hey"), ) + ("deprecated_arg_2", self.higher_version, "Hey"), + ) assert ( - str(warning.warnings[0].message) == - f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[0].message) + == f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey" ) assert ( - str(warning.warnings[1].message) == - f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey" + str(warning.warnings[1].message) + == f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey" ) def test_deprecate_class_obj(self): @@ -106,12 +98,11 @@ class Args: arg = 5 with self.assertWarns(FutureWarning) as warning: - arg = deprecate( - ("arg", self.higher_version, "message"), take_from=Args()) + arg = deprecate(("arg", self.higher_version, "message"), take_from=Args()) assert arg == 5 assert ( - str(warning.warning) == - f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. 
message" ) def test_deprecate_class_objs(self): @@ -124,45 +115,45 @@ class Args: ("arg", self.higher_version, "message"), ("foo", self.higher_version, "message"), ("does not exist", self.higher_version, "message"), - take_from=Args(), ) + take_from=Args(), + ) assert arg_1 == 5 assert arg_2 == 7 assert ( - str(warning.warning) == - f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warning) + == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" ) assert ( - str(warning.warnings[0].message) == - f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warnings[0].message) + == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message" ) assert ( - str(warning.warnings[1].message) == - f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message" + str(warning.warnings[1].message) + == f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message" ) def test_deprecate_incorrect_version(self): kwargs = {"deprecated_arg": 4} with self.assertRaises(ValueError) as error: - deprecate( - ("wrong_arg", self.lower_version, "message"), take_from=kwargs) + deprecate(("wrong_arg", self.lower_version, "message"), take_from=kwargs) assert ( - str(error.exception) == - f"The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since ppdiffusers' version {__version__} is >= {self.lower_version}" + str(error.exception) + == f"The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since ppdiffusers' version {__version__} is >= {self.lower_version}" ) def test_deprecate_incorrect_no_standard_warn(self): with self.assertWarns(FutureWarning) as warning: deprecate( - ("deprecated_arg", self.higher_version, - "This message is better!!!"), - standard_warn=False, ) + ("deprecated_arg", self.higher_version, "This message is better!!!"), + standard_warn=False, + ) assert str(warning.warning) == "This message is better!!!" def test_deprecate_stacklevel(self): with self.assertWarns(FutureWarning) as warning: deprecate( - ("deprecated_arg", self.higher_version, - "This message is better!!!"), - standard_warn=False, ) + ("deprecated_arg", self.higher_version, "This message is better!!!"), + standard_warn=False, + ) assert str(warning.warning) == "This message is better!!!" 
assert "test_utils.py" in warning.filename diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py index f0804e24b9b35..e49767c5a033b 100644 --- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -18,14 +18,20 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - XLMRobertaTokenizer) +from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer import ppdiffusers # noqa F401 -from ppdiffusers import (AltDiffusionPipeline, AutoencoderKL, DDIMScheduler, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AltDiffusionPipeline, + AutoencoderKL, + DDIMScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, RobertaSeriesModelWithTransformation) + RobertaSeriesConfig, + RobertaSeriesModelWithTransformation, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -48,13 +54,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -62,7 +70,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,11 +83,12 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=5002, ) + vocab_size=5002, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() tokenizer = XLMRobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-xlm-roberta", - model_max_length=77) # must set model_max_length 77 here + "hf-internal-testing/tiny-xlm-roberta", model_max_length=77 + ) # must set model_max_length 77 here components = { "unet": unet, "scheduler": scheduler, @@ -111,9 +121,9 @@ def test_alt_diffusion_ddim(self): layer_norm_eps=1e-05, num_attention_heads=4, num_hidden_layers=5, - vocab_size=5002, ) - text_encoder = RobertaSeriesModelWithTransformation( - text_encoder_config).eval() + vocab_size=5002, + ) + text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval() components["text_encoder"] = text_encoder alt_pipe = AltDiffusionPipeline(**components) alt_pipe.set_progress_bar_config(disable=None) @@ -123,17 +133,19 @@ def test_alt_diffusion_ddim(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.32336113, - 0.2371237, - 0.34009337, - 0.22972241, - 0.23742735, - 0.4925817, - 0.22020563, - 0.20505491, - 0.43374813, - ]) + expected_slice = np.array( + [ + 0.32336113, + 0.2371237, + 0.34009337, + 0.22972241, + 0.23742735, + 0.4925817, + 0.22020563, + 0.20505491, + 0.43374813, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_alt_diffusion_pndm(self): @@ -147,9 +159,9 @@ def 
test_alt_diffusion_pndm(self): layer_norm_eps=1e-05, num_attention_heads=4, num_hidden_layers=5, - vocab_size=5002, ) - text_encoder = RobertaSeriesModelWithTransformation( - text_encoder_config).eval() + vocab_size=5002, + ) + text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config).eval() components["text_encoder"] = text_encoder alt_pipe = AltDiffusionPipeline(**components) alt_pipe.set_progress_bar_config(disable=None) @@ -158,17 +170,19 @@ def test_alt_diffusion_pndm(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.24095133, - 0.26875997, - 0.34291863, - 0.2529385, - 0.2736602, - 0.49928105, - 0.23973131, - 0.21133915, - 0.41810605, - ]) + expected_slice = np.array( + [ + 0.24095133, + 0.26875997, + 0.34291863, + 0.2529385, + 0.2736602, + 0.49928105, + 0.23973131, + 0.21133915, + 0.41810605, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 @@ -181,8 +195,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_alt_diffusion(self): - alt_pipe = AltDiffusionPipeline.from_pretrained( - "BAAI/AltDiffusion", safety_checker=None) + alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -191,48 +204,47 @@ def test_alt_diffusion(self): generator=generator, guidance_scale=6.0, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.8718514442443848, - 0.8715569972991943, - 0.8748429417610168, - 0.8708409070968628, - 0.8782679438591003, - 0.8931069374084473, - 0.883078932762146, - 0.881088376045227, - 0.8617547154426575, - ]) + expected_slice = np.array( + [ + 0.8718514442443848, + 0.8715569972991943, + 0.8748429417610168, + 0.8708409070968628, + 0.8782679438591003, + 0.8931069374084473, + 0.883078932762146, + 0.881088376045227, + 0.8617547154426575, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_alt_diffusion_fast_ddim(self): - scheduler = DDIMScheduler.from_pretrained( - "BAAI/AltDiffusion", subfolder="scheduler") - alt_pipe = AltDiffusionPipeline.from_pretrained( - "BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") + alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) - output = alt_pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="numpy") + output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy") image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.9265012741088867, - 0.9305188059806824, - 0.8999797105789185, - 0.9346827268600464, - 0.9264709949493408, - 0.9447494745254517, - 0.9428927898406982, - 0.9417785406112671, - 0.9157286882400513, - ]) + expected_slice = np.array( + [ + 0.9265012741088867, + 0.9305188059806824, + 0.8999797105789185, + 0.9346827268600464, + 0.9264709949493408, + 0.9447494745254517, + 0.9428927898406982, + 0.9417785406112671, + 
0.9157286882400513, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index ca070f3ff45ee..1422ec516f01d 100644 --- a/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/ppdiffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -22,11 +22,17 @@ from paddlenlp.transformers import XLMRobertaTokenizer import ppdiffusers # noqa F401 -from ppdiffusers import (AltDiffusionImg2ImgPipeline, AutoencoderKL, - PNDMScheduler, UNet2DConditionModel) +from ppdiffusers import ( + AltDiffusionImg2ImgPipeline, + AutoencoderKL, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.image_processor import VaeImageProcessor from ppdiffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, RobertaSeriesModelWithTransformation) + RobertaSeriesConfig, + RobertaSeriesModelWithTransformation, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -42,8 +48,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -57,7 +62,8 @@ def dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -69,7 +75,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -83,7 +90,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=5006, ) + vocab_size=5006, + ) return RobertaSeriesModelWithTransformation(config) @property @@ -106,8 +114,7 @@ def test_stable_diffusion_img2img_default_case(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-xlm-roberta") + tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") tokenizer.model_max_length = 77 init_image = self.dummy_image alt_pipe = AltDiffusionImg2ImgPipeline( @@ -117,9 +124,9 @@ def test_stable_diffusion_img2img_default_case(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) - alt_pipe.image_processor = VaeImageProcessor( - vae_scale_factor=alt_pipe.vae_scale_factor) + feature_extractor=self.dummy_extractor, + ) + alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -129,7 +136,8 @@ def test_stable_diffusion_img2img_default_case(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - image=init_image, ) + image=init_image, + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = alt_pipe( @@ -139,24 +147,26 @@ def test_stable_diffusion_img2img_default_case(self): num_inference_steps=2, 
output_type="np", image=init_image, - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.48931587, - 0.40102208, - 0.49653798, - 0.4203022, - 0.34621224, - 0.50789315, - 0.41116416, - 0.4933398, - 0.5465742, - ]) + expected_slice = np.array( + [ + 0.48931587, + 0.40102208, + 0.49653798, + 0.4203022, + 0.34621224, + 0.50789315, + 0.41116416, + 0.4933398, + 0.5465742, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.005 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.005 def test_stable_diffusion_img2img_fp16(self): """Test that stable diffusion img2img works with fp16""" @@ -164,8 +174,7 @@ def test_stable_diffusion_img2img_fp16(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-xlm-roberta") + tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") tokenizer.model_max_length = 77 init_image = self.dummy_image unet = unet.to(dtype=paddle.float16) @@ -178,9 +187,9 @@ def test_stable_diffusion_img2img_fp16(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) - alt_pipe.image_processor = VaeImageProcessor( - vae_scale_factor=alt_pipe.vae_scale_factor) + feature_extractor=self.dummy_extractor, + ) + alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor) alt_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -189,7 +198,8 @@ def test_stable_diffusion_img2img_fp16(self): generator=generator, num_inference_steps=2, output_type="np", - image=init_image, ).images + image=init_image, + ).images assert image.shape == (1, 32, 32, 3) def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): @@ -198,8 +208,7 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): ) init_image = init_image.resize((760, 504)) model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "A fantasy landscape, trending on artstation" @@ -210,21 +219,24 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): strength=0.75, guidance_scale=7.5, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] image_slice = image[255:258, 383:386, -1] assert image.shape == (504, 760, 3) - expected_slice = np.array([ - 0.3251649, - 0.3340174, - 0.3418343, - 0.32628638, - 0.33462793, - 0.3300547, - 0.31628466, - 0.3470268, - 0.34273332, - ]) + expected_slice = np.array( + [ + 0.3251649, + 0.3340174, + 0.3418343, + 0.32628638, + 0.33462793, + 0.3300547, + 0.31628466, + 0.3470268, + 0.34273332, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 @@ -245,8 +257,7 @@ def test_stable_diffusion_img2img_pipeline_default(self): # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy" # ) model_id = "BAAI/AltDiffusion" - pipe = 
AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "A fantasy landscape, trending on artstation" @@ -257,19 +268,22 @@ def test_stable_diffusion_img2img_pipeline_default(self): strength=0.75, guidance_scale=7.5, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images assert image.shape == (1, 512, 768, 3) image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.09987255930900574, - 0.09875822067260742, - 0.12803134322166443, - 0.10067081451416016, - 0.1142435073852539, - 0.11815103888511658, - 0.14216548204421997, - 0.16465380787849426, - 0.15393462777137756, - ]) + expected_slice = np.array( + [ + 0.09987255930900574, + 0.09875822067260742, + 0.12803134322166443, + 0.10067081451416016, + 0.1142435073852539, + 0.11815103888511658, + 0.14216548204421997, + 0.16465380787849426, + 0.15393462777137756, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py index a8426c0ee78a1..e65d01ffc9eb8 100644 --- a/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ b/ppdiffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py @@ -19,9 +19,16 @@ import numpy as np import paddle -from ppdiffusers import (AudioDiffusionPipeline, AutoencoderKL, DDIMScheduler, - DDPMScheduler, DiffusionPipeline, Mel, - UNet2DConditionModel, UNet2DModel) +from ppdiffusers import ( + AudioDiffusionPipeline, + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + DiffusionPipeline, + Mel, + UNet2DConditionModel, + UNet2DModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -42,7 +49,8 @@ def dummy_unet(self): layers_per_block=2, block_out_channels=(128, 128), down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), ) + up_block_types=("UpBlock2D", "AttnUpBlock2D"), + ) return model @property @@ -56,7 +64,8 @@ def dummy_unet_condition(self): block_out_channels=(128, 128), down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - cross_attention_dim=10, ) + cross_attention_dim=10, + ) return model @property @@ -70,7 +79,8 @@ def dummy_vqvae_and_unet(self): layers_per_block=2, block_out_channels=(128, 128), down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), ) + up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), + ) unet = UNet2DModel( sample_size=(64, 32), in_channels=1, @@ -78,14 +88,14 @@ def dummy_vqvae_and_unet(self): layers_per_block=2, block_out_channels=(128, 128), down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), ) + up_block_types=("UpBlock2D", "AttnUpBlock2D"), + ) return vqvae, unet def test_audio_diffusion(self): mel = Mel() scheduler = DDPMScheduler() - pipe = AudioDiffusionPipeline( - vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) + pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(42) output = pipe(generator=generator, steps=4) @@ -96,55 +106,55 @@ def 
test_audio_diffusion(self): image_from_tuple = output[0][0] assert audio.shape == ( 1, - (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length, ) - assert (image.height == self.dummy_unet.config.sample_size[0] and - image.width == self.dummy_unet.config.sample_size[1]) + (self.dummy_unet.config.sample_size[1] - 1) * mel.hop_length, + ) + assert ( + image.height == self.dummy_unet.config.sample_size[0] + and image.width == self.dummy_unet.config.sample_size[1] + ) image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - image_from_tuple_slice = np.frombuffer( - image_from_tuple.tobytes(), dtype="uint8")[:10] + image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10] expected_slice = np.array([0, 252, 0, 160, 144, 1, 0, 211, 99, 3]) assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) <= 5 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() <= 5 scheduler = DDIMScheduler() dummy_vqvae_and_unet = self.dummy_vqvae_and_unet pipe = AudioDiffusionPipeline( vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, - scheduler=scheduler, ) + scheduler=scheduler, + ) pipe.set_progress_bar_config(disable=None) np.random.seed(0) raw_audio = np.random.uniform( -1, 1, - ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) * - mel.hop_length, ), ) + ((dummy_vqvae_and_unet[0].config.sample_size[1] - 1) * mel.hop_length,), + ) generator = paddle.Generator().manual_seed(42) - output = pipe( - raw_audio=raw_audio, generator=generator, start_step=5, steps=10) + output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10) image = output.images[0] assert ( image.height == self.dummy_vqvae_and_unet[0].config.sample_size[0] - and - image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1]) + and image.width == self.dummy_vqvae_and_unet[0].config.sample_size[1] + ) image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array( - [128, 100, 153, 95, 92, 77, 130, 121, 81, 166]) + expected_slice = np.array([128, 100, 153, 95, 92, 77, 130, 121, 81, 166]) assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 dummy_unet_condition = self.dummy_unet_condition pipe = AudioDiffusionPipeline( vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, - scheduler=scheduler, ) + scheduler=scheduler, + ) np.random.seed(0) encoding = paddle.rand(shape=(1, 1, 10)) output = pipe(generator=generator, encoding=encoding) image = output.images[0] image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array( - [139, 103, 88, 105, 100, 120, 116, 99, 106, 89]) + expected_slice = np.array([139, 103, 88, 105, 100, 120, 116, 99, 106, 89]) assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 @@ -157,8 +167,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_audio_diffusion(self): - pipe = DiffusionPipeline.from_pretrained( - "teticio/audio-diffusion-ddim-256") + pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(42) output = pipe(generator=generator) @@ -166,10 +175,9 @@ def test_audio_diffusion(self): image = output.images[0] assert audio.shape == ( 1, - (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length, ) - assert (image.height == pipe.unet.config.sample_size[0] and - image.width == 
pipe.unet.config.sample_size[1]) + (pipe.unet.config.sample_size[1] - 1) * pipe.mel.hop_length, + ) + assert image.height == pipe.unet.config.sample_size[0] and image.width == pipe.unet.config.sample_size[1] image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array( - [151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) + expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) assert np.abs(image_slice.flatten() - expected_slice).max() <= 5 diff --git a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py b/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py index 82c9242a44d2d..c9d67aaf82a83 100644 --- a/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py +++ b/ppdiffusers/tests/pipelines/audioldm/test_audioldm.py @@ -18,13 +18,22 @@ import numpy as np import paddle import paddle.nn.functional as F -from paddlenlp.transformers import (ClapTextConfig, ClapTextModelWithProjection, - RobertaTokenizer, SpeechT5HifiGan, - SpeechT5HifiGanConfig) - -from ppdiffusers import (AudioLDMPipeline, AutoencoderKL, DDIMScheduler, - LMSDiscreteScheduler, PNDMScheduler, - UNet2DConditionModel) +from paddlenlp.transformers import ( + ClapTextConfig, + ClapTextModelWithProjection, + RobertaTokenizer, + SpeechT5HifiGan, + SpeechT5HifiGanConfig, +) + +from ppdiffusers import ( + AudioLDMPipeline, + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.training_utils import enable_full_determinism from ppdiffusers.utils import require_paddle_gpu, slow @@ -39,16 +48,18 @@ class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): params = TEXT_TO_AUDIO_PARAMS batch_params = TEXT_TO_AUDIO_BATCH_PARAMS test_xformers_attention = False - required_optional_params = frozenset([ - "num_inference_steps", - "num_waveforms_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "num_waveforms_per_prompt", + "generator", + "latents", + "output_type", + "return_dict", + "callback", + "callback_steps", + ] + ) def get_dummy_components(self): paddle.seed(0) @@ -63,13 +74,15 @@ def get_dummy_components(self): cross_attention_dim=(32, 64), class_embed_type="simple_projection", projection_class_embeddings_input_dim=32, - class_embeddings_concat=True, ) + class_embeddings_concat=True, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -77,7 +90,8 @@ def get_dummy_components(self): out_channels=1, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = ClapTextConfig( bos_token_id=0, @@ -89,11 +103,11 @@ def get_dummy_components(self): num_hidden_layers=5, pad_token_id=1, vocab_size=1000, - projection_dim=32, ) + projection_dim=32, + ) text_encoder = ClapTextModelWithProjection(text_encoder_config) text_encoder.eval() - tokenizer = RobertaTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-roberta", model_max_length=77) + tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) vocoder_config = SpeechT5HifiGanConfig( model_in_dim=8, @@ -103,7 +117,8 @@ def 
get_dummy_components(self): upsample_kernel_sizes=[4, 4], resblock_kernel_sizes=[3, 7], resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, ) + normalize_before=False, + ) vocoder = SpeechT5HifiGan(vocoder_config) vocoder.eval() @@ -139,18 +154,20 @@ def test_audioldm_ddim(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([ - -0.0050, - 0.0050, - -0.0060, - 0.0033, - -0.0026, - 0.0033, - -0.0027, - 0.0033, - -0.0028, - 0.0033, - ]) + expected_slice = np.array( + [ + -0.0050, + 0.0050, + -0.0060, + 0.0033, + -0.0026, + 0.0033, + -0.0027, + 0.0033, + -0.0028, + 0.0033, + ] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-2 @@ -175,10 +192,13 @@ def test_audioldm_prompt_embeds(self): max_length=audioldm_pipe.tokenizer.model_max_length, return_attention_mask=True, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"].cast("int32") - prompt_embeds = audioldm_pipe.text_encoder(text_inputs, ) + prompt_embeds = audioldm_pipe.text_encoder( + text_inputs, + ) prompt_embeds = prompt_embeds.text_embeds # additional L_2 normalization over each hidden-state prompt_embeds = F.normalize(prompt_embeds, axis=-1) @@ -216,10 +236,13 @@ def test_audioldm_negative_prompt_embeds(self): max_length=audioldm_pipe.tokenizer.model_max_length, truncation=True, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"].cast("int32") - text_embeds = audioldm_pipe.text_encoder(text_inputs, ) + text_embeds = audioldm_pipe.text_encoder( + text_inputs, + ) text_embeds = text_embeds.text_embeds # additional L_2 normalization over each hidden-state text_embeds = F.normalize(text_embeds, axis=-1) @@ -249,18 +272,20 @@ def test_audioldm_negative_prompt(self): assert len(audio) == 256 audio_slice = audio[:10] - expected_slice = np.array([ - -0.0051, - 0.0050, - -0.0060, - 0.0034, - -0.0026, - 0.0033, - -0.0027, - 0.0033, - -0.0028, - 0.0032, - ]) + expected_slice = np.array( + [ + -0.0051, + 0.0050, + -0.0060, + 0.0034, + -0.0026, + 0.0033, + -0.0027, + 0.0033, + -0.0028, + 0.0032, + ] + ) assert np.abs(audio_slice - expected_slice).max() < 1e-2 @@ -278,8 +303,7 @@ def test_audioldm_num_waveforms_per_prompt(self): # test num_waveforms_per_prompt=1 (default) for batch of prompts batch_size = 2 - audios = audioldm_pipe( - [prompt] * batch_size, num_inference_steps=2).audios + audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios assert audios.shape == (batch_size, 256) @@ -288,7 +312,8 @@ def test_audioldm_num_waveforms_per_prompt(self): audios = audioldm_pipe( prompt, num_inference_steps=2, - num_waveforms_per_prompt=num_waveforms_per_prompt, ).audios + num_waveforms_per_prompt=num_waveforms_per_prompt, + ).audios assert audios.shape == (num_waveforms_per_prompt, 256) @@ -297,7 +322,8 @@ def test_audioldm_num_waveforms_per_prompt(self): audios = audioldm_pipe( [prompt] * batch_size, num_inference_steps=2, - num_waveforms_per_prompt=num_waveforms_per_prompt, ).audios + num_waveforms_per_prompt=num_waveforms_per_prompt, + ).audios assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) @@ -339,12 +365,10 @@ def test_audioldm_vocoder_model_in_dim(self): assert audio_shape == (1, 256) def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass( - test_mean_pixel_difference=False) + self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) def 
test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - test_mean_pixel_difference=False) + self._test_inference_batch_single_identical(test_mean_pixel_difference=False) @slow @@ -380,25 +404,26 @@ def test_audioldm(self): assert len(audio) == 81920 audio_slice = audio[77230:77240] - expected_slice = np.array([ - -0.4884, - -0.4607, - 0.0023, - 0.5007, - 0.5896, - 0.5151, - 0.3813, - -0.0208, - -0.3687, - -0.4315, - ]) + expected_slice = np.array( + [ + -0.4884, + -0.4607, + 0.0023, + 0.5007, + 0.5896, + 0.5151, + 0.3813, + -0.0208, + -0.3687, + -0.4315, + ] + ) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 1e-2 def test_audioldm_lms(self): audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config( - audioldm_pipe.scheduler.config) + audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) audioldm_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -408,17 +433,19 @@ def test_audioldm_lms(self): assert len(audio) == 81920 audio_slice = audio[27780:27790] - expected_slice = np.array([ - -0.2131, - -0.0873, - -0.0124, - -0.0189, - 0.0569, - 0.1373, - 0.1883, - 0.2886, - 0.3297, - 0.2212, - ]) + expected_slice = np.array( + [ + -0.2131, + -0.0873, + -0.0124, + -0.0189, + 0.0569, + 0.1373, + 0.1883, + 0.2886, + 0.3297, + 0.2212, + ] + ) max_diff = np.abs(expected_slice - audio_slice).max() assert max_diff < 3e-2 diff --git a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py index 9b76eed8898ad..b8477a5e775df 100644 --- a/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ b/ppdiffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py @@ -23,8 +23,10 @@ from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, - UNCONDITIONAL_AUDIO_GENERATION_PARAMS) +from ..pipeline_params import ( + UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, + UNCONDITIONAL_AUDIO_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -55,9 +57,9 @@ def get_dummy_components(self): use_timestep_embedding=False, time_embedding_type="fourier", mid_block_type="UNetMidBlock1D", - down_block_types=("DownBlock1DNoSkip", "DownBlock1D", - "AttnDownBlock1D"), - up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), ) + down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), + up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), + ) scheduler = IPNDMScheduler() components = {"unet": unet, "scheduler": scheduler} return components @@ -65,11 +67,7 @@ def get_dummy_components(self): def get_dummy_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed) - inputs = { - "batch_size": 1, - "generator": generator, - "num_inference_steps": 4 - } + inputs = {"batch_size": 1, "generator": generator, "num_inference_steps": 4} return inputs def test_dance_diffusion(self): @@ -81,8 +79,7 @@ def test_dance_diffusion(self): audio = output.audios audio_slice = audio[0, -3:, -3:] assert audio.shape == (1, 2, components["unet"].sample_size) - expected_slice = np.array( - [1.0, 1.0, 0.9972942, -0.4477799, -0.5952974, 1.0]) + expected_slice = np.array([1.0, 1.0, 0.9972942, -0.4477799, -0.5952974, 1.0]) assert np.abs(audio_slice.flatten() - expected_slice).max() 
< 0.01 @@ -98,42 +95,39 @@ def test_dance_diffusion(self): pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - output = pipe( - generator=generator, - num_inference_steps=100, - audio_length_in_s=4.096) + output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) audio = output.audios audio_slice = audio[0, -3:, -3:] assert audio.shape == (1, 2, pipe.unet.sample_size) - expected_slice = np.array([ - -0.15758808, - -0.15257765, - -0.12701476, - -0.26994032, - -0.27616554, - -0.24865153, - ]) + expected_slice = np.array( + [ + -0.15758808, + -0.15257765, + -0.12701476, + -0.26994032, + -0.27616554, + -0.24865153, + ] + ) assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.01 def test_dance_diffusion_fp16(self): - pipe = DanceDiffusionPipeline.from_pretrained( - "harmonai/maestro-150k", paddle_dtype=paddle.float16) + pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - output = pipe( - generator=generator, - num_inference_steps=100, - audio_length_in_s=4.096) + output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) audio = output.audios audio_slice = audio[0, -3:, -3:] assert audio.shape == (1, 2, pipe.unet.sample_size) # scheduler use fp32 - expected_slice = np.array([ - -0.15350387, - -0.14624646, - -0.12091318, - -0.25969276, - -0.26154587, - -0.23359495, - ]) + expected_slice = np.array( + [ + -0.15350387, + -0.14624646, + -0.12091318, + -0.25969276, + -0.26154587, + -0.23359495, + ] + ) assert np.abs(audio_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/ddim/test_ddim.py b/ppdiffusers/tests/pipelines/ddim/test_ddim.py index c2fb14bc1020a..92f66001a03f4 100644 --- a/ppdiffusers/tests/pipelines/ddim/test_ddim.py +++ b/ppdiffusers/tests/pipelines/ddim/test_ddim.py @@ -21,8 +21,10 @@ from ppdiffusers import DDIMPipeline, DDIMScheduler, UNet2DModel from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow -from ..pipeline_params import (UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, - UNCONDITIONAL_IMAGE_GENERATION_PARAMS) +from ..pipeline_params import ( + UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, + UNCONDITIONAL_IMAGE_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -47,7 +49,8 @@ def get_dummy_components(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) scheduler = DDIMScheduler() components = {"unet": unet, "scheduler": scheduler} return components @@ -71,17 +74,19 @@ def test_inference(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 32, 32, 3)) - expected_slice = np.array([ - 0.0, - 0.00152004, - 0.0, - 0.0, - 0.00860906, - 0.00182715, - 0.00189051, - 1.0, - 0.668702, - ]) + expected_slice = np.array( + [ + 0.0, + 0.00152004, + 0.0, + 0.0, + 0.00860906, + 0.00182715, + 0.00189051, + 1.0, + 0.668702, + ] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) @@ -99,10 +104,7 @@ def test_inference_cifar10(self): image = ddim(generator=generator, eta=0.0, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 
32, 32, 3) - expected_slice = np.array([ - 0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446, - 0.2388 - ]) + expected_slice = np.array([0.2060, 0.2042, 0.2022, 0.2193, 0.2146, 0.2110, 0.2471, 0.2446, 0.2388]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_inference_ema_bedroom(self): @@ -115,15 +117,17 @@ def test_inference_ema_bedroom(self): image = ddim(generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.19830778, - 0.18826014, - 0.18584034, - 0.1927332, - 0.18754855, - 0.17855307, - 0.18288234, - 0.16375086, - 0.1497818, - ]) + expected_slice = np.array( + [ + 0.19830778, + 0.18826014, + 0.18584034, + 0.1927332, + 0.18754855, + 0.17855307, + 0.18288234, + 0.16375086, + 0.1497818, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py b/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py index 988129c546625..f2d25b2e39403 100644 --- a/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py +++ b/ppdiffusers/tests/pipelines/ddpm/test_ddpm.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def test_fast_inference(self): @@ -42,33 +43,33 @@ def test_fast_inference(self): ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) ddpm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=2, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = ddpm( generator=generator, num_inference_steps=2, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.0, - 0.0, - 0.0, - 0.0, - 0.007474243640899658, - 0.0, - 0.007990598678588867, - 0.9972629547119141, - 0.6665917634963989, - ]) + expected_slice = np.array( + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.007474243640899658, + 0.0, + 0.007990598678588867, + 0.9972629547119141, + 0.6665917634963989, + ] + ) print(image_slice.flatten().tolist()) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_inference_predict_sample(self): unet = self.dummy_uncond_unet @@ -76,18 +77,14 @@ def test_inference_predict_sample(self): ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) ddpm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=2, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - image_eps = ddpm( - generator=generator, num_inference_steps=2, output_type="numpy")[0] + image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0] image_slice = image[0, -3:, -3:, -1] image_eps_slice = image_eps[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) tolerance = 0.01 - 
assert (np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() - < tolerance) + assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance @slow @@ -103,8 +100,5 @@ def test_inference_cifar10(self): image = ddpm(generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604, - 0.2020 - ]) + expected_slice = np.array([0.4454, 0.2025, 0.0315, 0.3023, 0.2575, 0.1031, 0.0953, 0.1604, 0.2020]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py b/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py index 9f3a881a35c78..acb9a8a602116 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/__init__.py @@ -30,13 +30,11 @@ class IFPipelineTesterMixin: def _get_dummy_components(self): paddle.seed(0) - text_encoder = T5EncoderModel.from_pretrained( - "hf-internal-testing/tiny-random-t5") + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") text_encoder.eval() paddle.seed(0) - tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-t5") + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") paddle.seed(0) unet = UNet2DConditionModel( @@ -48,9 +46,7 @@ def _get_dummy_components(self): "SimpleCrossAttnDownBlock2D", ], mid_block_type="UNetMidBlock2DSimpleCrossAttn", - up_block_types=[ - "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D" - ], + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], in_channels=3, out_channels=6, cross_attention_dim=32, @@ -60,9 +56,9 @@ def _get_dummy_components(self): addition_embed_type_num_heads=2, cross_attention_norm="group_norm", resnet_time_scale_shift="scale_shift", - act_fn="gelu", ) - unet.set_attn_processor( - AttnAddedKVProcessor()) # For reproducibility tests + act_fn="gelu", + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests paddle.seed(0) scheduler = DDPMScheduler( @@ -74,7 +70,8 @@ def _get_dummy_components(self): dynamic_thresholding_ratio=0.95, sample_max_value=1.0, prediction_type="epsilon", - variance_type="learned_range", ) + variance_type="learned_range", + ) paddle.seed(0) watermarker = IFWatermarker() @@ -91,13 +88,11 @@ def _get_dummy_components(self): def _get_superresolution_dummy_components(self): paddle.seed(0) - text_encoder = T5EncoderModel.from_pretrained( - "hf-internal-testing/tiny-random-t5") + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") text_encoder.eval() paddle.seed(0) - tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-t5") + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") paddle.seed(0) unet = UNet2DConditionModel( @@ -109,9 +104,7 @@ def _get_superresolution_dummy_components(self): "SimpleCrossAttnDownBlock2D", ], mid_block_type="UNetMidBlock2DSimpleCrossAttn", - up_block_types=[ - "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D" - ], + up_block_types=["SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"], in_channels=6, out_channels=6, cross_attention_dim=32, @@ -125,9 +118,9 @@ def _get_superresolution_dummy_components(self): class_embed_type="timestep", mid_block_scale_factor=1.414, time_embedding_act_fn="gelu", - time_embedding_dim=32, ) - unet.set_attn_processor( - 
AttnAddedKVProcessor()) # For reproducibility tests + time_embedding_dim=32, + ) + unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests paddle.seed(0) scheduler = DDPMScheduler( @@ -139,14 +132,16 @@ def _get_superresolution_dummy_components(self): dynamic_thresholding_ratio=0.95, sample_max_value=1.0, prediction_type="epsilon", - variance_type="learned_range", ) + variance_type="learned_range", + ) paddle.seed(0) image_noising_scheduler = DDPMScheduler( num_train_timesteps=1000, beta_schedule="squaredcos_cap_v2", beta_start=0.0001, - beta_end=0.02, ) + beta_end=0.02, + ) paddle.seed(0) watermarker = IFWatermarker() @@ -226,8 +221,7 @@ def _test_save_load_optional_components(self): pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) pipe_loaded.set_progress_bar_config(disable=None) - pipe_loaded.unet.set_attn_processor( - AttnAddedKVProcessor()) # For reproducibility tests + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests for optional_component in pipe._optional_components: self.assertTrue( @@ -278,8 +272,7 @@ def _test_save_load_local(self): pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) pipe_loaded.set_progress_bar_config(disable=None) - pipe_loaded.unet.set_attn_processor( - AttnAddedKVProcessor()) # For reproducibility tests + pipe_loaded.unet.set_attn_processor(AttnAddedKVProcessor()) # For reproducibility tests inputs = self.get_dummy_inputs() output_loaded = pipe_loaded(**inputs)[0] diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py index f5daacd7abdcb..4192ea593d45d 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if.py @@ -19,26 +19,31 @@ import paddle from ppdiffusers import ( - IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, IFInpaintingPipeline, - IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline) + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, +) from ppdiffusers.models.attention_processor import AttnAddedKVProcessor -from ppdiffusers.utils.testing_utils import (floats_tensor, load_numpy, - require_paddle_gpu, slow) +from ppdiffusers.utils.testing_utils import ( + floats_tensor, + load_numpy, + require_paddle_gpu, + slow, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import (PipelineTesterMixin, - assert_mean_pixel_difference) +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference from . 
import IFPipelineTesterMixin -class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, - unittest.TestCase): +class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFPipeline params = TEXT_TO_IMAGE_PARAMS - {"width", "height", "latents"} batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_dummy_components() @@ -69,11 +74,12 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) @slow @@ -88,24 +94,21 @@ def tearDown(self): def test_all(self): # if - pipe_1 = IFPipeline.from_pretrained( - "DeepFloyd/IF-I-XL-v1.0", - variant="fp16", - paddle_dtype=paddle.float16) + pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16) pipe_2 = IFSuperResolutionPipeline.from_pretrained( "DeepFloyd/IF-II-L-v1.0", variant="fp16", paddle_dtype=paddle.float16, text_encoder=None, - tokenizer=None, ) + tokenizer=None, + ) # pre compute text embeddings and remove T5 to save memory pipe_1.text_encoder - prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt( - "anime turtle") + prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle") del pipe_1.tokenizer del pipe_1.text_encoder @@ -136,8 +139,7 @@ def test_all(self): pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds) + self._test_if_img2img(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) pipe_1.remove_all_hooks() pipe_2.remove_all_hooks() @@ -153,8 +155,7 @@ def test_all(self): pipe_1.unet.set_attn_processor(AttnAddedKVProcessor()) pipe_2.unet.set_attn_processor(AttnAddedKVProcessor()) - self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds) + self._test_if_inpainting(pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds) def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): # pipeline 1 @@ -165,7 +166,8 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): negative_prompt_embeds=negative_prompt_embeds, num_inference_steps=2, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -191,7 +193,8 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): image=image, generator=generator, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -205,8 +208,7 @@ def _test_if(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): ) assert_mean_pixel_difference(image, expected_image) - def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds): + def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): # pipeline 1 image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) @@ -219,7 +221,8 @@ def _test_if_img2img(self, pipe_1, 
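The slow IF test above is also the clearest statement of the memory-saving pattern it exercises: encode the prompt once with the stage-1 pipeline's T5 encoder, delete the encoder and tokenizer, and hand the precomputed embeddings to every stage. A sketch built from the calls visible in test_all and _test_if; anything beyond what the hunk shows (for example the exact seed) is an assumption.

import paddle

from ppdiffusers import IFPipeline, IFSuperResolutionPipeline

pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", paddle_dtype=paddle.float16)
pipe_2 = IFSuperResolutionPipeline.from_pretrained(
    "DeepFloyd/IF-II-L-v1.0",
    variant="fp16",
    paddle_dtype=paddle.float16,
    text_encoder=None,  # stage 2 reuses the stage-1 embeddings, so it never loads its own T5
    tokenizer=None,
)

# Pre-compute text embeddings and remove T5 to save memory.
prompt_embeds, negative_prompt_embeds = pipe_1.encode_prompt("anime turtle")
del pipe_1.tokenizer
del pipe_1.text_encoder

# Stage 1 consumes the embeddings directly instead of a prompt string.
# Two inference steps only to keep the test fast; real use needs many more.
generator = paddle.Generator().manual_seed(0)
image = pipe_1(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    num_inference_steps=2,
    generator=generator,
    output_type="np",
).images[0]
# The same embeddings are then passed to pipe_2 together with `image` for super-resolution.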
pipe_2, prompt_embeds, image=image, num_inference_steps=2, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -247,7 +250,8 @@ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, original_image=original_image, generator=generator, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -261,8 +265,7 @@ def _test_if_img2img(self, pipe_1, pipe_2, prompt_embeds, ) assert_mean_pixel_difference(image, expected_image) - def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, - negative_prompt_embeds): + def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, negative_prompt_embeds): # pipeline 1 image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)) @@ -276,7 +279,8 @@ def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, mask_image=mask_image, num_inference_steps=2, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] @@ -306,7 +310,8 @@ def _test_if_inpainting(self, pipe_1, pipe_2, prompt_embeds, original_image=original_image, generator=generator, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images[0] diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py index 3fce4eab7164b..bab44fc4a5cbf 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img.py @@ -20,20 +20,19 @@ from ppdiffusers import IFImg2ImgPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, - unittest.TestCase): +class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_dummy_components() @@ -58,8 +57,7 @@ def test_save_load_optional_components(self): self._test_save_load_optional_components() def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_float16(self): # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder @@ -75,4 +73,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py index effd8aec47da6..0d977c5d6f2ee 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py @@ -20,21 +20,19 @@ from ppdiffusers import IFImg2ImgSuperResolutionPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFImg2ImgSuperResolutionPipelineFastTests( - PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): +class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFImg2ImgSuperResolutionPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union( - {"original_image"}) - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_superresolution_dummy_components() @@ -58,8 +56,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -75,4 +72,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py index 803ebffdb1ad5..e46b7c5ebea69 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting.py @@ -20,20 +20,19 @@ from ppdiffusers import IFInpaintingPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, - unittest.TestCase): +class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFInpaintingPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_dummy_components() @@ -57,8 +56,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -74,4 +72,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py index 0f24c066122e2..d50852284146e 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py @@ -20,21 +20,19 @@ from ppdiffusers import IFInpaintingSuperResolutionPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFInpaintingSuperResolutionPipelineFastTests( - PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): +class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFInpaintingSuperResolutionPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union( - {"original_image"}) - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_superresolution_dummy_components() @@ -60,8 +58,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -77,4 +74,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py index ae1810b58f991..79a7319b80757 100644 --- a/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py +++ b/ppdiffusers/tests/pipelines/deepfloyd_if/test_if_superresolution.py @@ -20,20 +20,19 @@ from ppdiffusers import IFSuperResolutionPipeline from ppdiffusers.utils import floats_tensor -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin from . 
import IFPipelineTesterMixin -class IFSuperResolutionPipelineFastTests( - PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): +class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.TestCase): pipeline_class = IFSuperResolutionPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"width", "height"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} def get_dummy_components(self): return self._get_superresolution_dummy_components() @@ -55,8 +54,7 @@ def get_dummy_inputs(self, seed=0): return inputs def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) def test_save_load_optional_components(self): self._test_save_load_optional_components() @@ -72,4 +70,6 @@ def test_save_load_local(self): self._test_save_load_local() def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=1e-2, ) + self._test_inference_batch_single_identical( + expected_max_diff=1e-2, + ) diff --git a/ppdiffusers/tests/pipelines/dit/test_dit.py b/ppdiffusers/tests/pipelines/dit/test_dit.py index ffbe5d6d4dc33..c9d17607fcbd0 100644 --- a/ppdiffusers/tests/pipelines/dit/test_dit.py +++ b/ppdiffusers/tests/pipelines/dit/test_dit.py @@ -19,13 +19,20 @@ import numpy as np import paddle -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DiTPipeline, - DPMSolverMultistepScheduler, Transformer2DModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DiTPipeline, + DPMSolverMultistepScheduler, + Transformer2DModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, - CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS) +from ..pipeline_params import ( + CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, + CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -55,7 +62,8 @@ def get_dummy_components(self): activation_fn="gelu-approximate", num_embeds_ada_norm=1000, norm_type="ada_norm_zero", - norm_elementwise_affine=False, ) + norm_elementwise_affine=False, + ) vae = AutoencoderKL() scheduler = DDIMScheduler() components = { @@ -85,20 +93,15 @@ def test_inference(self): image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 16, 16, 3)) print(image_slice.flatten()) - expected_slice = np.array([ - 0.28088313, 0.0, 0.8108508, 1.0, 1.0, 0.47994, 0.9075564, 0.0, - 0.14398015 - ]) + expected_slice = np.array([0.28088313, 0.0, 0.8108508, 1.0, 1.0, 0.47994, 0.9075564, 0.0, 0.14398015]) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical( - relax_max_difference=True, expected_max_diff=1e-3) + self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-3) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) @require_paddle_gpu @@ -116,35 +119,35 @@ def 
test_dit_256(self): words = ["vase", "umbrella", "white shark", "white wolf"] ids = pipe.get_label_ids(words) - images = pipe( - ids, generator=generator, num_inference_steps=40, - output_type="np").images - expected_slices = np.array([ - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 0.0, 0.0016301274299621582, 0.0, 0.0, 0.0, 0.0], + images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images + expected_slices = np.array( [ - 0.434637188911438, - 0.4323567748069763, - 0.4406988322734833, - 0.442973256111145, - 0.4462621212005615, - 0.45129328966140747, - 0.41893237829208374, - 0.42390328645706177, - 0.3906112015247345, - ], - [ - 0.9986965656280518, - 0.9948190450668335, - 0.9841029644012451, - 0.9911775588989258, - 0.9871039390563965, - 0.9874314069747925, - 0.9822297096252441, - 0.9997426271438599, - 1.0, - ], - ]) + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0016301274299621582, 0.0, 0.0, 0.0, 0.0], + [ + 0.434637188911438, + 0.4323567748069763, + 0.4406988322734833, + 0.442973256111145, + 0.4462621212005615, + 0.45129328966140747, + 0.41893237829208374, + 0.42390328645706177, + 0.3906112015247345, + ], + [ + 0.9986965656280518, + 0.9948190450668335, + 0.9841029644012451, + 0.9911775588989258, + 0.9871039390563965, + 0.9874314069747925, + 0.9822297096252441, + 0.9997426271438599, + 1.0, + ], + ] + ) for word, image, expected_slice in zip(words, images, expected_slices): # expected_image = load_numpy( @@ -152,37 +155,34 @@ def test_dit_256(self): # ) assert image.shape == (256, 256, 3) image_slice = image[-3:, -3:, -1] - assert np.abs((image_slice.flatten() - expected_slice).max( - )) < 0.001 + assert np.abs((image_slice.flatten() - expected_slice).max()) < 0.001 def test_dit_512_fp16(self): - pipe = DiTPipeline.from_pretrained( - "facebook/DiT-XL-2-512", paddle_dtype=paddle.float16) - pipe.scheduler = DPMSolverMultistepScheduler.from_config( - pipe.scheduler.config) + pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512", paddle_dtype=paddle.float16) + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe.to("gpu") words = ["vase", "umbrella"] ids = pipe.get_label_ids(words) generator = paddle.Generator().manual_seed(0) - images = pipe( - ids, generator=generator, num_inference_steps=25, - output_type="np").images + images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images - expected_slices = np.array([ - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.994140625], + expected_slices = np.array( [ - 0.0, - 0.0, - 0.01708984375, - 0.024658203125, - 0.0830078125, - 0.134521484375, - 0.175537109375, - 0.33740234375, - 0.207763671875, - ], - ]) + [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.994140625], + [ + 0.0, + 0.0, + 0.01708984375, + 0.024658203125, + 0.0830078125, + 0.134521484375, + 0.175537109375, + 0.33740234375, + 0.207763671875, + ], + ] + ) for word, image, expected_slice in zip(words, images, expected_slices): # expected_image = load_numpy( diff --git a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py b/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py index aff5775323867..da80059ddfdc4 100644 --- a/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py +++ b/ppdiffusers/tests/pipelines/karras_ve/test_karras_ve.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", 
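test_dit_512_fp16 above doubles as a usage recipe for the class-conditional DiT pipeline: human-readable labels are mapped to ImageNet class ids, the default scheduler is swapped for multistep DPM-Solver via from_config, and inference runs in fp16 on GPU. A condensed sketch of exactly those calls:

import paddle

from ppdiffusers import DiTPipeline, DPMSolverMultistepScheduler

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512", paddle_dtype=paddle.float16)
# The test pairs the 512 checkpoint with multistep DPM-Solver and only 25 sampling steps.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("gpu")

# DiT is class-conditional: words become ImageNet class ids before the call.
ids = pipe.get_label_ids(["vase", "umbrella"])

generator = paddle.Generator().manual_seed(0)
images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images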
"UpBlock2D"), + ) return model def test_inference(self): @@ -42,22 +43,20 @@ def test_inference(self): pipe = KarrasVePipeline(unet=unet, scheduler=scheduler) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = pipe( - num_inference_steps=2, generator=generator, - output_type="numpy").images + image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = pipe( num_inference_steps=2, generator=generator, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -70,20 +69,20 @@ def test_inference(self): pipe = KarrasVePipeline(unet=model, scheduler=scheduler) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = pipe( - num_inference_steps=20, generator=generator, - output_type="numpy").images + image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.7528239, - 0.7529462, - 0.76014197, - 0.75482357, - 0.75692874, - 0.7577723, - 0.760527, - 0.758951, - 0.7599246, - ]) + expected_slice = np.array( + [ + 0.7528239, + 0.7529462, + 0.76014197, + 0.75482357, + 0.75692874, + 0.7577723, + 0.760527, + 0.758951, + 0.7599246, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py index 93583e8814480..3bdb01281a103 100644 --- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py @@ -20,10 +20,18 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, - UNet2DConditionModel) -from ppdiffusers.utils.testing_utils import (load_numpy, nightly, - require_paddle_gpu, slow) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LDMTextToImagePipeline, + UNet2DConditionModel, +) +from ppdiffusers.utils.testing_utils import ( + load_numpy, + nightly, + require_paddle_gpu, + slow, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -55,13 +63,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=(32, 64), @@ -69,7 +79,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), up_block_types=("UpDecoderBlock2D", 
"UpDecoderBlock2D"), - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -80,10 +91,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -113,17 +124,19 @@ def test_inference_text2img(self): image = pipe(**inputs).images assert image.shape == (1, 64, 64, 3) image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.28524342, - 0.23806289, - 0.38151595, - 0.21939021, - 0.26112252, - 0.5172909, - 0.25647423, - 0.25049314, - 0.47979864, - ]) + expected_slice = np.array( + [ + 0.28524342, + 0.23806289, + 0.38151595, + 0.21939021, + 0.26112252, + 0.5172909, + 0.25647423, + 0.25049314, + 0.47979864, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -150,24 +163,25 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained( - "CompVis/ldm-text2im-large-256") + pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.51825, - 0.5285, - 0.52543, - 0.54258, - 0.52304, - 0.52569, - 0.54363, - 0.55276, - 0.56878, - ]) + expected_slice = np.array( + [ + 0.51825, + 0.5285, + 0.52543, + 0.54258, + 0.52304, + 0.52569, + 0.54363, + 0.55276, + 0.56878, + ] + ) max_diff = np.abs(expected_slice - image_slice).max() assert max_diff < 0.02 @@ -195,8 +209,7 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained( - "CompVis/ldm-text2im-large-256") + pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256") pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images[0] diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index 32472986acf44..aea2e7538f903 100644 --- a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -19,10 +19,8 @@ import numpy as np import paddle -from ppdiffusers import (DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, - VQModel) -from ppdiffusers.utils import (PIL_INTERPOLATION, floats_tensor, load_image, - slow) +from ppdiffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel +from ppdiffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle @@ -32,8 +30,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -46,7 +43,8 @@ def dummy_uncond_unet(self): in_channels=6, 
out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model @property @@ -58,15 +56,15 @@ def dummy_vq_model(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, ) + latent_channels=3, + ) return model def test_inference_superresolution(self): unet = self.dummy_uncond_unet scheduler = DDIMScheduler() vqvae = self.dummy_vq_model - ldm = LDMSuperResolutionPipeline( - unet=unet, vqvae=vqvae, scheduler=scheduler) + ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) ldm.set_progress_bar_config(disable=None) init_image = self.dummy_image generator = paddle.Generator().manual_seed(0) @@ -74,20 +72,23 @@ def test_inference_superresolution(self): image=init_image, generator=generator, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.12982202, - 0.8338444, - 0.46506804, - 0.5459576, - 0.6662215, - 0.38444045, - 0.72195464, - 0.5719301, - 0.36579454, - ]) + expected_slice = np.array( + [ + 0.12982202, + 0.8338444, + 0.46506804, + 0.5459576, + 0.6662215, + 0.38444045, + 0.72195464, + 0.5719301, + 0.36579454, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_inference_superresolution_fp16(self): @@ -96,12 +97,10 @@ def test_inference_superresolution_fp16(self): vqvae = self.dummy_vq_model unet = unet.to(dtype=paddle.float16) vqvae = vqvae.to(dtype=paddle.float16) - ldm = LDMSuperResolutionPipeline( - unet=unet, vqvae=vqvae, scheduler=scheduler) + ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) ldm.set_progress_bar_config(disable=None) init_image = self.dummy_image - image = ldm(init_image, num_inference_steps=2, - output_type="numpy").images + image = ldm(init_image, num_inference_steps=2, output_type="numpy").images assert image.shape == (1, 64, 64, 3) @@ -112,21 +111,17 @@ def test_inference_superresolution(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool.png" ) - init_image = init_image.resize( - (64, 64), resample=PIL_INTERPOLATION["lanczos"]) - ldm = LDMSuperResolutionPipeline.from_pretrained( - "duongna/ldm-super-resolution") + init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"]) + ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution") ldm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) image = ldm( image=init_image, generator=generator, num_inference_steps=20, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.756, 0.7425, 0.7257, - 0.6907 - ]) + expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.756, 0.7425, 0.7257, 0.6907]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py index 5ad34d0481b67..89319ee92bcb2 100644 --- 
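The slow super-resolution test above spells out the inference path end to end: fetch a low-resolution image, shrink it to 64x64 with Lanczos resampling, and run LDMSuperResolutionPipeline for 20 steps, after which the test expects a 256x256 result. A sketch using only the calls shown in that test:

import paddle

from ppdiffusers import LDMSuperResolutionPipeline
from ppdiffusers.utils import PIL_INTERPOLATION, load_image

init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool.png"
)
# In the test a 64x64 input comes back as 256x256, i.e. a 4x upscale.
init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"])

ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution")
ldm.set_progress_bar_config(disable=None)

generator = paddle.Generator().manual_seed(0)
image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images
assert image.shape == (1, 256, 256, 3)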
a/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +++ b/ppdiffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py @@ -34,7 +34,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model @property @@ -46,7 +47,8 @@ def dummy_vq_model(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, ) + latent_channels=3, + ) return model @property @@ -61,7 +63,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() def test_inference_uncond(self): @@ -71,33 +74,33 @@ def test_inference_uncond(self): ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler) ldm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ldm(generator=generator, - num_inference_steps=2, - output_type="numpy").images + image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = ldm( generator=generator, num_inference_steps=2, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.827049, - 1.0, - 0.6244688, - 0.7729403, - 1.0, - 0.73071766, - 0.6108738, - 0.9107263, - 0.7249622, - ]) + expected_slice = np.array( + [ + 0.827049, + 1.0, + 0.6244688, + 0.7729403, + 1.0, + 0.73071766, + 0.6108738, + 0.9107263, + 0.7249622, + ] + ) tolerance = 0.01 assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance - assert (np.abs(image_from_tuple_slice.flatten() - expected_slice).max() - < tolerance) + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance @slow @@ -107,21 +110,21 @@ def test_inference_uncond(self): ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") ldm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ldm(generator=generator, - num_inference_steps=5, - output_type="numpy").images + image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([ - 0.59802866, - 0.61698544, - 0.62753576, - 0.6128236, - 0.60961217, - 0.617262, - 0.6060791, - 0.60261935, - 0.6129079, - ]) + expected_slice = np.array( + [ + 0.59802866, + 0.61698544, + 0.62753576, + 0.6128236, + 0.60961217, + 0.617262, + 0.6060791, + 0.60261935, + 0.6129079, + ] + ) tolerance = 0.01 assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance diff --git a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py b/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py index 35c1718941567..00025bde5002d 100644 --- a/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py +++ b/ppdiffusers/tests/pipelines/paint_by_example/test_paint_by_example.py @@ -22,14 +22,20 @@ from paddlenlp.transformers import CLIPImageProcessor, CLIPVisionConfig from PIL import Image -from ppdiffusers import (AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, 
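The unconditional LDM test that follows reduces to a short usage pattern: load the CelebA-HQ checkpoint, seed a generator, and sample with no prompt at all. A sketch of the slow test's calls; the test keeps num_inference_steps at 5 purely to stay fast, real sampling would normally use far more steps.

import paddle

from ppdiffusers import LDMPipeline

ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
ldm.set_progress_bar_config(disable=None)

# Unconditional generation: no prompt, just a seeded generator and a step count.
generator = paddle.Generator().manual_seed(0)
image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images
assert image.shape == (1, 256, 256, 3)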
- UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + PaintByExamplePipeline, + PNDMScheduler, + UNet2DConditionModel, +) from ppdiffusers.pipelines.paint_by_example import PaintByExampleImageEncoder from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -48,7 +54,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -57,7 +64,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) config = CLIPVisionConfig( hidden_size=32, @@ -67,7 +75,8 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, image_size=32, - patch_size=4, ) + patch_size=4, + ) image_encoder = PaintByExampleImageEncoder(config, proj_size=32) feature_extractor = CLIPImageProcessor(crop_size=32, size=32) components = { @@ -93,13 +102,9 @@ def test_save_load_float16(self): def get_dummy_inputs(self, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (64, 64))) - example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (32, 32)) + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) + example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) generator = paddle.Generator().manual_seed(seed) inputs = { @@ -122,17 +127,19 @@ def test_paint_by_example_inpaint(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.82595694, - 0.51862055, - 0.5474039, - 0.2411496, - 0.20220888, - 0.3430622, - 0.3558151, - 0.06606945, - 0.4550809, - ]) + expected_slice = np.array( + [ + 0.82595694, + 0.51862055, + 0.5474039, + 0.2411496, + 0.20220888, + 0.3430622, + 0.3558151, + 0.06606945, + 0.4550809, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_paint_by_example_image_tensor(self): @@ -172,8 +179,7 @@ def test_paint_by_example(self): example_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/panda.jpg" ) - pipe = PaintByExamplePipeline.from_pretrained( - "Fantasy-Studio/Paint-by-Example") + pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(seed=321) output = pipe( @@ -183,12 +189,10 @@ def test_paint_by_example(self): generator=generator, guidance_scale=5.0, num_inference_steps=50, - output_type="np", ) + output_type="np", + ) image = 
output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.529, - 0.5374 - ]) + expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.529, 0.5374]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 diff --git a/ppdiffusers/tests/pipelines/pipeline_params.py b/ppdiffusers/tests/pipelines/pipeline_params.py index 33b041a173248..9f835e6e783cc 100644 --- a/ppdiffusers/tests/pipelines/pipeline_params.py +++ b/ppdiffusers/tests/pipelines/pipeline_params.py @@ -22,80 +22,89 @@ # I.e. a text to image pipeline with non-configurable height and width arguments # should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. -TEXT_TO_IMAGE_PARAMS = frozenset([ - "prompt", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", -]) +TEXT_TO_IMAGE_PARAMS = frozenset( + [ + "prompt", + "height", + "width", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + "cross_attention_kwargs", + ] +) TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) -IMAGE_VARIATION_PARAMS = frozenset([ - "image", - "height", - "width", - "guidance_scale", -]) +IMAGE_VARIATION_PARAMS = frozenset( + [ + "image", + "height", + "width", + "guidance_scale", + ] +) IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"]) -TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset([ - "prompt", - "image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", -]) - -TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset( - ["prompt", "image", "negative_prompt"]) - -TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset([ - # Text guided image variation with an image mask - "prompt", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", -]) - -TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset( - ["prompt", "image", "mask_image", "negative_prompt"]) - -IMAGE_INPAINTING_PARAMS = frozenset([ - # image variation with an image mask - "image", - "mask_image", - "height", - "width", - "guidance_scale", -]) +TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset( + [ + "prompt", + "image", + "height", + "width", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + ] +) + +TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"]) + +TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( + [ + # Text guided image variation with an image mask + "prompt", + "image", + "mask_image", + "height", + "width", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + ] +) + +TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"]) + +IMAGE_INPAINTING_PARAMS = frozenset( + [ + # image variation with an image mask + "image", + "mask_image", + "height", + "width", + "guidance_scale", + ] +) IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"]) -IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset([ - "example_image", - "image", - "mask_image", - "height", - "width", - "guidance_scale", -]) +IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( + [ + "example_image", + "image", + "mask_image", + "height", + "width", + "guidance_scale", + ] +) 
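The slow Paint-by-Example test above is effectively a recipe: a source image, a mask over the region to replace, and an example image that conditions what gets painted in, driven by guidance_scale=5.0 over 50 steps. A sketch mirroring those calls; only the example-image URL is visible in the hunk, so the init and mask paths below are hypothetical placeholders.

import paddle

from ppdiffusers import PaintByExamplePipeline
from ppdiffusers.utils import load_image

init_image = load_image("dog_in_bucket.png")  # hypothetical local file: the image to edit
mask_image = load_image("bucket_mask.png")    # hypothetical local file: mask over the region to repaint
example_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/paint_by_example/panda.jpg"
)

pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
pipe.set_progress_bar_config(disable=None)

generator = paddle.Generator().manual_seed(seed=321)
image = pipe(
    image=init_image,
    mask_image=mask_image,
    example_image=example_image,
    generator=generator,
    guidance_scale=5.0,
    num_inference_steps=50,
    output_type="np",
).images[0]  # a single 512x512x3 array, per the shape assertion in the test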
-IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset( - ["example_image", "image", "mask_image"]) +IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"]) CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"]) @@ -109,15 +118,17 @@ UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([]) -TEXT_TO_AUDIO_PARAMS = frozenset([ - "prompt", - "audio_length_in_s", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", -]) +TEXT_TO_AUDIO_PARAMS = frozenset( + [ + "prompt", + "audio_length_in_s", + "guidance_scale", + "negative_prompt", + "prompt_embeds", + "negative_prompt_embeds", + "cross_attention_kwargs", + ] +) TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"]) diff --git a/ppdiffusers/tests/pipelines/pndm/test_pndm.py b/ppdiffusers/tests/pipelines/pndm/test_pndm.py index 2255f43742f71..bfa6285a45d5f 100644 --- a/ppdiffusers/tests/pipelines/pndm/test_pndm.py +++ b/ppdiffusers/tests/pipelines/pndm/test_pndm.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def test_inference(self): @@ -42,22 +43,20 @@ def test_inference(self): pndm = PNDMPipeline(unet=unet, scheduler=scheduler) pndm.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = pndm( - generator=generator, num_inference_steps=20, - output_type="numpy").images + image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images generator = paddle.Generator().manual_seed(0) image_from_tuple = pndm( generator=generator, num_inference_steps=20, output_type="numpy", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -73,15 +72,17 @@ def test_inference_cifar10(self): image = pndm(generator=generator, output_type="numpy").images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.15949559211730957, - 0.17172572016716003, - 0.17315810918807983, - 0.1836635172367096, - 0.1823960244655609, - 0.1799020767211914, - 0.21776044368743896, - 0.22992581129074097, - 0.21678516268730164, - ]) + expected_slice = np.array( + [ + 0.15949559211730957, + 0.17172572016716003, + 0.17315810918807983, + 0.1836635172367096, + 0.1823960244655609, + 0.1799020767211914, + 0.21776044368743896, + 0.22992581129074097, + 0.21678516268730164, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/repaint/test_repaint.py b/ppdiffusers/tests/pipelines/repaint/test_repaint.py index 3bce3769af1be..9d27e3b1c5061 100644 --- a/ppdiffusers/tests/pipelines/repaint/test_repaint.py +++ b/ppdiffusers/tests/pipelines/repaint/test_repaint.py @@ -20,11 +20,14 @@ import paddle from ppdiffusers import RePaintPipeline, RePaintScheduler, UNet2DModel -from 
ppdiffusers.utils.testing_utils import (load_image, load_numpy, nightly, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import ( + load_image, + load_numpy, + nightly, + require_paddle_gpu, +) -from ..pipeline_params import (IMAGE_INPAINTING_BATCH_PARAMS, - IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -49,7 +52,8 @@ def get_dummy_components(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) scheduler = RePaintScheduler() components = {"unet": unet, "scheduler": scheduler} return components @@ -80,17 +84,19 @@ def test_repaint(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.08341709, - 0.54262626, - 0.549711, - 0.00903523, - 0.0, - 1.0, - 0.05136755, - 0.5604646, - 0.6273578, - ]) + expected_slice = np.array( + [ + 0.08341709, + 0.54262626, + 0.549711, + 0.00903523, + 0.0, + 1.0, + 0.05136755, + 0.5604646, + 0.6273578, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 # RePaint can hardly be made deterministic since the scheduler is currently always @@ -133,7 +139,8 @@ def test_celebahq(self): jump_length=10, jump_n_sample=10, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (256, 256, 3) assert np.abs(expected_image - image).mean() < 0.01 diff --git a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py index f3b799000aa41..97af9d23e974c 100644 --- a/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py +++ b/ppdiffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py @@ -33,7 +33,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def test_inference(self): @@ -42,22 +43,20 @@ def test_inference(self): sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler) sde_ve.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = sde_ve( - num_inference_steps=2, output_type="numpy", - generator=generator).images + image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images generator = paddle.Generator().manual_seed(0) image_from_tuple = sde_ve( num_inference_steps=2, output_type="numpy", generator=generator, - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) expected_slice = np.array([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -70,9 +69,7 @@ def test_inference(self): sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler) sde_ve.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = sde_ve( - num_inference_steps=10, output_type="numpy", - generator=generator).images + image = 
sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 256, 256, 3) expected_slice = np.array([1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0]) diff --git a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py index 6188cab488e6a..cf7e0a7ba17a7 100644 --- a/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ b/ppdiffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py @@ -22,10 +22,16 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) -from ppdiffusers.pipelines.semantic_stable_diffusion import \ - SemanticStableDiffusionPipeline as StableDiffusionPipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.semantic_stable_diffusion import ( + SemanticStableDiffusionPipeline as StableDiffusionPipeline, +) from ppdiffusers.utils import floats_tensor, nightly from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -41,8 +47,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -56,7 +61,8 @@ def dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -68,7 +74,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -83,7 +90,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -108,11 +116,11 @@ def test_semantic_diffusion_ddim(self): beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -120,7 +128,8 @@ def test_semantic_diffusion_ddim(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -129,7 +138,8 @@ def test_semantic_diffusion_ddim(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -138,29 +148,31 @@ def test_semantic_diffusion_ddim(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - 
return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28401083, - 0.23724163, - 0.38141036, - 0.2201719, - 0.26111937, - 0.5176592, - 0.25668317, - 0.25036532, - 0.47986418, - ]) + expected_slice = np.array( + [ + 0.28401083, + 0.23724163, + 0.38141036, + 0.2201719, + 0.26111937, + 0.5176592, + 0.25668317, + 0.25036532, + 0.47986418, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_semantic_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) assert isinstance(pipe, StableDiffusionPipeline) assert isinstance(pipe.scheduler, LMSDiscreteScheduler) assert pipe.safety_checker is None @@ -168,8 +180,7 @@ def test_semantic_diffusion_no_safety_checker(self): assert image is not None with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) assert pipe.safety_checker is None image = pipe("example prompt", num_inference_steps=2).images[0] assert image is not None @@ -179,8 +190,7 @@ def test_semantic_diffusion_pndm(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -188,7 +198,8 @@ def test_semantic_diffusion_pndm(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -197,7 +208,8 @@ def test_semantic_diffusion_pndm(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -206,24 +218,26 @@ def test_semantic_diffusion_pndm(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.18612236, - 0.24176982, - 0.36099488, - 0.21807766, - 0.27262795, - 0.51991826, - 0.22258872, - 0.22143877, - 0.4452843, - ]) + expected_slice = np.array( + [ + 0.18612236, + 0.24176982, + 0.36099488, + 0.21807766, + 0.27262795, + 0.51991826, + 0.22258872, + 0.22143877, + 0.4452843, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.02 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.02 def test_semantic_diffusion_fp16(self): """Test that 
stable diffusion works with fp16""" @@ -231,8 +245,7 @@ def test_semantic_diffusion_fp16(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet = unet.to(dtype=paddle.float16) vae = vae.to(dtype=paddle.float16) bert = bert.to(dtype=paddle.float16) @@ -243,11 +256,11 @@ def test_semantic_diffusion_fp16(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" - image = sd_pipe( - [prompt], num_inference_steps=2, output_type="np").images + image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images assert image.shape == (1, 64, 64, 3) @@ -260,8 +273,7 @@ def tearDown(self): # paddle.device.cuda.empty_cache() def test_positive_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "a photo of a cat" edit = { @@ -283,7 +295,8 @@ def test_positive_guidance(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -308,7 +321,8 @@ def test_positive_guidance(self): output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -326,8 +340,7 @@ def test_positive_guidance(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_negative_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "an image of a crowded boulevard, realistic, 4k" edit = { @@ -349,7 +362,8 @@ def test_negative_guidance(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -374,7 +388,8 @@ def test_negative_guidance(self): output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -392,13 +407,11 @@ def test_negative_guidance(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_multi_cond_guidance(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") pipe.set_progress_bar_config(disable=None) prompt = "a castle next to a river" edit = { - "editing_prompt": - ["boat on a river, boat", "monet, impression, sunrise"], + "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"], "reverse_editing_direction": False, "edit_warmup_steps": [15, 18], "edit_guidance_scale": 6, @@ -416,7 +429,8 @@ def test_multi_cond_guidance(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -441,7 +455,8 @@ def test_multi_cond_guidance(self): 
output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -459,8 +474,7 @@ def test_multi_cond_guidance(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.02 def test_guidance_fp16(self): - pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) prompt = "a photo of a cat" edit = { @@ -482,7 +496,8 @@ def test_guidance_fp16(self): num_inference_steps=50, output_type="np", width=512, - height=512, ) + height=512, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -507,7 +522,8 @@ def test_guidance_fp16(self): output_type="np", width=512, height=512, - **edit, ) + **edit, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ diff --git a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 9355b00dcdff0..465b997e0c007 100644 --- a/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/ppdiffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -18,15 +18,19 @@ import numpy as np import paddle -from ppdiffusers import (DDPMScheduler, MidiProcessor, - SpectrogramDiffusionPipeline) +from ppdiffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline from ppdiffusers.pipelines.spectrogram_diffusion import ( - SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder) + SpectrogramContEncoder, + SpectrogramNotesEncoder, + T5FilmDecoder, +) from ppdiffusers.training_utils import enable_full_determinism from ppdiffusers.utils import require_paddle_gpu, slow -from ..pipeline_params import (TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, - TOKENS_TO_AUDIO_GENERATION_PARAMS) +from ..pipeline_params import ( + TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, + TOKENS_TO_AUDIO_GENERATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin enable_full_determinism(42) @@ -38,8 +42,7 @@ # is not compatible with python 3.8 which we run in the CI. 
# https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:98 # @unittest.skip("The note-seq package currently throws an error on import") -class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = SpectrogramDiffusionPipeline required_optional_params = PipelineTesterMixin.required_optional_params - { "callback", @@ -65,7 +68,8 @@ def get_dummy_components(self): num_heads=1, d_kv=4, d_ff=2048, - feed_forward_proj="gated-gelu", ) + feed_forward_proj="gated-gelu", + ) notes_encoder.eval() paddle.seed(0) continuous_encoder = SpectrogramContEncoder( @@ -77,7 +81,8 @@ def get_dummy_components(self): num_heads=1, d_kv=4, d_ff=2048, - feed_forward_proj="gated-gelu", ) + feed_forward_proj="gated-gelu", + ) continuous_encoder.eval() paddle.seed(0) @@ -90,7 +95,8 @@ def get_dummy_components(self): num_heads=1, d_kv=4, d_ff=2048, - dropout_rate=0.1, ) + dropout_rate=0.1, + ) decoder.eval() scheduler = DDPMScheduler() @@ -108,23 +114,26 @@ def get_dummy_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed) inputs = { - "input_tokens": [[ - 1134, - 90, - 1135, - 1133, - 1080, - 112, - 1132, - 1080, - 1133, - 1079, - 133, - 1132, - 1079, - 1133, - 1, - ] + [0] * 2033], + "input_tokens": [ + [ + 1134, + 90, + 1135, + 1133, + 1080, + 112, + 1132, + 1080, + 1133, + 1079, + 133, + 1132, + 1079, + 1133, + 1, + ] + + [0] * 2033 + ], "generator": generator, "num_inference_steps": 4, "output_type": "mel", @@ -144,17 +153,19 @@ def test_spectrogram_diffusion(self): mel_slice = mel[0, -3:, -3:] assert mel_slice.shape == (3, 3) - expected_slice = np.array([ - -11.46511, - 4.0, - -8.506372, - -11.512925, - -11.512925, - -10.417862, - -8.077912, - 3.7985802, - 4.0, - ]) + expected_slice = np.array( + [ + -11.46511, + 4.0, + -8.506372, + -11.512925, + -11.512925, + -10.417862, + -8.077912, + 3.7985802, + 4.0, + ] + ) assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2 def test_save_load_local(self): @@ -191,8 +202,7 @@ def tearDown(self): def test_callback(self): # TODO - test that pipeline can decode tokens in a callback # so that music can be played live - pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") melgan = pipe.melgan pipe.melgan = None @@ -215,12 +225,12 @@ def callback(step, mel_output): num_inference_steps=5, generator=generator, callback=callback, - output_type="mel", ) + output_type="mel", + ) def test_spectrogram_fast(self): - pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe.set_progress_bar_config(disable=None) processor = MidiProcessor() @@ -237,8 +247,7 @@ def test_spectrogram_fast(self): def test_spectrogram(self): - pipe = SpectrogramDiffusionPipeline.from_pretrained( - "google/music-spectrogram-diffusion") + pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") pipe.set_progress_bar_config(disable=None) processor = MidiProcessor() @@ -249,8 +258,7 @@ def test_spectrogram(self): input_tokens = input_tokens[:4] generator = paddle.Generator().manual_seed(0) - output = pipe( - input_tokens, num_inference_steps=100, generator=generator) + output = pipe(input_tokens, num_inference_steps=100, 
generator=generator) audio = output.audios[0] assert abs(np.abs(audio).sum() - 14418.089) < 5e-2 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index f5beae09ac46f..50c27ff574be4 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -21,13 +21,19 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + CycleDiffusionPipeline, + DDIMScheduler, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -39,11 +45,8 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "width", "negative_prompt_embeds", } - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union( - {"source_prompt"}) + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"}) def get_dummy_components(self): paddle.seed(0) @@ -55,14 +58,16 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -70,7 +75,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -81,10 +87,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -123,17 +129,19 @@ def test_stable_diffusion_cycle(self): images = output.images image_slice = images[0, -3:, -3:, -1] assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.04812625, - 0.77983606, - 0.71009433, - 0.15924984, - 0.9788434, - 0.49732354, - 0.362224, - 0.6481595, - 0.4530744, - ]) + expected_slice = np.array( + [ + 0.04812625, + 0.77983606, + 0.71009433, + 0.15924984, + 0.9788434, + 0.49732354, + 0.362224, + 0.6481595, + 0.4530744, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_cycle_fp16(self): @@ 
-148,17 +156,19 @@ def test_stable_diffusion_cycle_fp16(self): images = output.images image_slice = images[0, -3:, -3:, -1] assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.05053711, - 0.78125, - 0.7114258, - 0.15991211, - 0.9785156, - 0.49804688, - 0.36279297, - 0.6484375, - 0.45361328, - ]) + expected_slice = np.array( + [ + 0.05053711, + 0.78125, + 0.7114258, + 0.15991211, + 0.9785156, + 0.49804688, + 0.36279297, + 0.6484375, + 0.45361328, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 @unittest.skip("non-deterministic pipeline") @@ -178,18 +188,17 @@ def test_cycle_diffusion_pipeline_fp16(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png" ) - expected_image = np.array([[0.14477539, 0.20483398, 0.14135742], - [0.10009766, 0.17602539, 0.11083984]]) + expected_image = np.array([[0.14477539, 0.20483398, 0.14135742], [0.10009766, 0.17602539, 0.11083984]]) init_image = init_image.resize((512, 512)) model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained( - model_id, subfolder="scheduler") + scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") pipe = CycleDiffusionPipeline.from_pretrained( model_id, scheduler=scheduler, safety_checker=None, paddle_dtype=paddle.float16, - revision="fp16", ) + revision="fp16", + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() source_prompt = "A black colored car" @@ -205,7 +214,8 @@ def test_cycle_diffusion_pipeline_fp16(self): guidance_scale=3, source_guidance_scale=1, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images assert np.abs(image[0][0][:2] - expected_image).max() < 0.5 @@ -213,14 +223,11 @@ def test_cycle_diffusion_pipeline(self): init_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/black_colored_car.png" ) - expected_image = np.array([[0.16294342, 0.20514232, 0.14554858], - [0.11476257, 0.16831946, 0.11495486]]) + expected_image = np.array([[0.16294342, 0.20514232, 0.14554858], [0.11476257, 0.16831946, 0.11495486]]) init_image = init_image.resize((512, 512)) model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained( - model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained( - model_id, scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") + pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() source_prompt = "A black colored car" @@ -236,6 +243,7 @@ def test_cycle_diffusion_pipeline(self): guidance_scale=3, source_guidance_scale=1, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images assert np.abs(image[0][0][:2] - expected_image).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 184bd9f7b4927..042ad47fa00eb 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -22,10 +22,17 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, 
DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, - UNet2DConditionModel, logging) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, + logging, +) from ppdiffusers.utils import load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu @@ -49,13 +56,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -63,7 +72,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,10 +84,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -110,17 +120,19 @@ def test_stable_diffusion_ddim(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28519553, - 0.23807192, - 0.38150552, - 0.21930423, - 0.26092762, - 0.51721215, - 0.25639117, - 0.25039536, - 0.47978917, - ]) + expected_slice = np.array( + [ + 0.28519553, + 0.23807192, + 0.38150552, + 0.21930423, + 0.26092762, + 0.51721215, + 0.25639117, + 0.25039536, + 0.47978917, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_lora(self): @@ -159,14 +171,14 @@ def test_stable_diffusion_prompt_embeds(self): padding="max_length", max_length=sd_pipe.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"] prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] inputs["prompt_embeds"] = prompt_embeds output = sd_pipe(**inputs) image_slice_2 = output.images[0, -3:, -3:, -1] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max( - ) < 0.0001 + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001 def test_stable_diffusion_negative_prompt_embeds(self): components = self.get_dummy_components() @@ -187,14 +199,14 @@ def test_stable_diffusion_negative_prompt_embeds(self): padding="max_length", max_length=sd_pipe.tokenizer.model_max_length, truncation=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_inputs = text_inputs["input_ids"] embeds.append(sd_pipe.text_encoder(text_inputs)[0]) inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds output = sd_pipe(**inputs) image_slice_2 = output.images[0, -3:, -3:, -1] - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max( - ) < 
0.0001 + assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 0.0001 def test_stable_diffusion_ddim_factor_8(self): components = self.get_dummy_components() @@ -205,17 +217,19 @@ def test_stable_diffusion_ddim_factor_8(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 136, 136, 3) - expected_slice = np.array([ - 0.39545745, - 0.94682777, - 0.6828775, - 0.42496994, - 0.49475053, - 0.48353004, - 0.27300328, - 0.30724254, - 0.50566095, - ]) + expected_slice = np.array( + [ + 0.39545745, + 0.94682777, + 0.6828775, + 0.42496994, + 0.49475053, + 0.48353004, + 0.27300328, + 0.30724254, + 0.50566095, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 @@ -229,23 +243,25 @@ def test_stable_diffusion_pndm(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.18620703, - 0.24143961, - 0.3609084, - 0.21810293, - 0.27230006, - 0.51992655, - 0.22248471, - 0.2213102, - 0.44538254, - ]) + expected_slice = np.array( + [ + 0.18620703, + 0.24143961, + 0.3609084, + 0.21810293, + 0.27230006, + 0.51992655, + 0.22248471, + 0.2213102, + 0.44538254, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) assert isinstance(pipe, StableDiffusionPipeline) assert isinstance(pipe.scheduler, LMSDiscreteScheduler) assert pipe.safety_checker is None @@ -253,8 +269,7 @@ def test_stable_diffusion_no_safety_checker(self): assert image is not None with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) assert pipe.safety_checker is None image = pipe("example prompt", num_inference_steps=2).images[0] assert image is not None @@ -262,80 +277,82 @@ def test_stable_diffusion_no_safety_checker(self): def test_stable_diffusion_k_lms(self): components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output = sd_pipe(**inputs) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.29910105, - 0.22905633, - 0.37701294, - 0.21332851, - 0.26000416, - 0.52840894, - 0.25865072, - 0.25947532, - 0.47509664, - ]) + expected_slice = np.array( + [ + 0.29910105, + 0.22905633, + 0.37701294, + 0.21332851, + 0.26000416, + 0.52840894, + 0.25865072, + 0.25947532, + 0.47509664, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler_ancestral(self): components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output = sd_pipe(**inputs) 
image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.29917336, - 0.22854236, - 0.37669897, - 0.2137424, - 0.25940597, - 0.528258, - 0.25919583, - 0.2594489, - 0.47522712, - ]) + expected_slice = np.array( + [ + 0.29917336, + 0.22854236, + 0.37669897, + 0.2137424, + 0.25940597, + 0.528258, + 0.25919583, + 0.2594489, + 0.47522712, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler(self): components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output = sd_pipe(**inputs) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.29910135, - 0.22905621, - 0.3770129, - 0.21332836, - 0.26000386, - 0.52840906, - 0.2586509, - 0.2594754, - 0.47509673, - ]) + expected_slice = np.array( + [ + 0.29910135, + 0.22905621, + 0.3770129, + 0.21332836, + 0.26000386, + 0.52840906, + 0.2586509, + 0.2594754, + 0.47509673, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_vae_slicing(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) image_count = 4 @@ -346,9 +363,7 @@ def test_stable_diffusion_vae_slicing(self): inputs = self.get_dummy_inputs() inputs["prompt"] = [inputs["prompt"]] * image_count output_2 = sd_pipe(**inputs) - assert ( - np.abs(output_2.images.flatten() - output_1.images.flatten()).max() - < 0.003) + assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 0.003 def test_stable_diffusion_vae_tiling(self): components = self.get_dummy_components() @@ -367,7 +382,8 @@ def test_stable_diffusion_vae_tiling(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) # make sure tiled vae decode yields the same result sd_pipe.enable_vae_tiling() @@ -377,11 +393,10 @@ def test_stable_diffusion_vae_tiling(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) - assert ( - np.abs(output_2.images.flatten() - output_1.images.flatten()).max() - < 5e-1) + assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1 def test_stable_diffusion_negative_prompt(self): components = self.get_dummy_components() @@ -394,17 +409,19 @@ def test_stable_diffusion_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.16709289, - 0.26912582, - 0.35834038, - 0.23045751, - 0.30960953, - 0.5324909, - 0.20372942, - 0.2368694, - 0.43633103, - ]) + expected_slice = np.array( + [ + 0.16709289, + 0.26912582, + 0.35834038, + 0.23045751, + 0.30960953, + 0.5324909, + 0.20372942, + 0.2368694, + 0.43633103, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_num_images_per_prompt(self): @@ -416,59 +433,59 @@ def 
test_stable_diffusion_num_images_per_prompt(self): images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images assert images.shape == (1, 64, 64, 3) batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, num_inference_steps=2, - output_type="np").images + images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images assert images.shape == (batch_size, 64, 64, 3) num_images_per_prompt = 2 images = sd_pipe( prompt, num_inference_steps=2, output_type="np", - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (num_images_per_prompt, 64, 64, 3) batch_size = 2 images = sd_pipe( [prompt] * batch_size, num_inference_steps=2, output_type="np", - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) def test_stable_diffusion_long_prompt(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) do_classifier_free_guidance = True negative_prompt = None num_images_per_prompt = 1 - logger = logging.get_logger( - "ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") prompt = 25 * "@" with CaptureLogger(logger) as cap_logger_3: text_embeddings_3 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) prompt = 100 * "@" with CaptureLogger(logger) as cap_logger: text_embeddings = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) negative_prompt = "Hello" with CaptureLogger(logger) as cap_logger_2: text_embeddings_2 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) - assert (text_embeddings_3.shape == text_embeddings_2.shape == - text_embeddings.shape) + negative_prompt, + ) + assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape assert text_embeddings.shape[1] == 77 assert cap_logger.out == cap_logger_2.out assert cap_logger.out.count("@") == 25 @@ -476,20 +493,14 @@ def test_stable_diffusion_long_prompt(self): def test_stable_diffusion_height_width_opt(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) prompt = "hey" output = sd_pipe(prompt, num_inference_steps=1, output_type="np") image_shape = output.images[0].shape[:2] assert image_shape == (64, 64) - output = sd_pipe( - prompt, - num_inference_steps=1, - height=96, - width=96, - output_type="np") + output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") image_shape = output.images[0].shape[:2] assert image_shape == (96, 96) config = dict(sd_pipe.unet.config) @@ -523,113 +534,116 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_1_1_pndm(self): - 
sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-1") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.43625, - 0.43554, - 0.3667, - 0.4066, - 0.39703, - 0.38658, - 0.43936, - 0.43557, - 0.40592, - ]) + expected_slice = np.array( + [ + 0.43625, + 0.43554, + 0.3667, + 0.4066, + 0.39703, + 0.38658, + 0.43936, + 0.43557, + 0.40592, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.574, - 0.47841, - 0.31625, - 0.63583, - 0.58306, - 0.55056, - 0.50825, - 0.56306, - 0.55748, - ]) + expected_slice = np.array( + [ + 0.574, + 0.47841, + 0.31625, + 0.63583, + 0.58306, + 0.55056, + 0.50825, + 0.56306, + 0.55748, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.38019, - 0.28647, - 0.27321, - 0.40377, - 0.3829, - 0.35446, - 0.39218, - 0.38165, - 0.42239, - ]) + expected_slice = np.array( + [ + 0.38019, + 0.28647, + 0.27321, + 0.40377, + 0.3829, + 0.35446, + 0.39218, + 0.38165, + 0.42239, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.10542, - 0.0962, - 0.07332, - 0.09015, - 0.09382, - 0.07597, - 0.08496, - 0.07806, - 0.06455, - ]) + expected_slice = np.array( + [ + 0.10542, + 0.0962, + 0.07332, + 0.09015, + 0.09382, + 0.07597, + 0.08496, + 0.07806, + 0.06455, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", 
safety_checker=None) + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.03503, - 0.03494, - 0.01087, - 0.03128, - 0.02552, - 0.00803, - 0.00742, - 0.00372, - 0.0, - ]) + expected_slice = np.array( + [ + 0.03503, + 0.03494, + 0.01087, + 0.03128, + 0.02552, + 0.00803, + 0.00742, + 0.00372, + 0.0, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 # def test_stable_diffusion_attention_slicing(self): @@ -670,8 +684,7 @@ def test_stable_diffusion_dpm(self): # assert np.abs(image_sliced - image).max() < 0.01 def test_stable_diffusion_fp16_vs_autocast(self): - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs(dtype="float16") image_fp16 = pipe(**inputs).images @@ -684,8 +697,7 @@ def test_stable_diffusion_fp16_vs_autocast(self): def test_stable_diffusion_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -693,40 +705,41 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.5693, - -0.3018, - -0.9746, - 0.0518, - -0.877, - 0.7559, - -1.7402, - 0.1022, - 1.1582, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.5693, + -0.3018, + -0.9746, + 0.0518, + -0.877, + 0.7559, + -1.7402, + 0.1022, + 1.1582, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.1958, - -0.2993, - -1.0166, - -0.5005, - -0.481, - 0.6162, - -0.9492, - 0.6621, - 1.4492, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.1958, + -0.2993, + -1.0166, + -0.5005, + -0.481, + 0.6162, + -0.9492, + 0.6621, + 1.4492, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -758,8 +771,7 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -770,22 +782,22 @@ def test_stable_diffusion_1_4_pndm(self): 
assert max_diff < 0.001 def test_stable_diffusion_1_5_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([ - [0.7839468, 0.6564859, 0.48896512], - [0.78088367, 0.6400461, 0.447728], - [0.81458974, 0.67865074, 0.51496047], - ]) + expected_image = np.array( + [ + [0.7839468, 0.6564859, 0.48896512], + [0.78088367, 0.6400461, 0.447728], + [0.81458974, 0.67865074, 0.51496047], + ] + ) max_diff = np.abs(expected_image - image[0][0:3]).max() assert max_diff < 0.001 def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -797,10 +809,8 @@ def test_stable_diffusion_ddim(self): assert max_diff < 0.001 def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -811,34 +821,34 @@ def test_stable_diffusion_lms(self): assert max_diff < 0.001 def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = EulerDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([ - [0.7907467, 0.69895816, 0.5911293], - [0.7878128, 0.6815276, 0.55695873], - [0.79491043, 0.69076216, 0.58900857], - ]) + expected_image = np.array( + [ + [0.7907467, 0.69895816, 0.5911293], + [0.7878128, 0.6815276, 0.55695873], + [0.79491043, 0.69076216, 0.58900857], + ] + ) max_diff = np.abs(expected_image - image[0][0:3]).max() assert max_diff < 0.001 def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 25 image = sd_pipe(**inputs).images[0] - expected_image = np.array([ - [0.8398815, 0.7510048, 0.6475117], - [0.8548264, 0.75703114, 0.63529825], - [0.8559129, 0.75676, 0.6597851], - ]) + expected_image = np.array( + [ + [0.8398815, 0.7510048, 0.6475117], + [0.8548264, 0.75703114, 0.63529825], + [0.8559129, 0.75676, 0.6597851], + ] + ) max_diff = np.abs(expected_image - image[0][0:3]).max() assert max_diff < 0.001 diff --git 
a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py index 5c19060a6d83a..4a6a51ef4cefb 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py @@ -20,19 +20,24 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, PNDMScheduler, - StableDiffusionAdapterPipeline, T2IAdapter, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + PNDMScheduler, + StableDiffusionAdapterPipeline, + T2IAdapter, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, load_numpy, slow from ppdiffusers.utils.import_utils import is_ppxformers_available -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionAdapterPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionAdapterPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionAdapterPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS @@ -47,7 +52,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.Generator().manual_seed(seed=0) vae = AutoencoderKL( @@ -56,7 +62,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) vae_scale_factor = 2 paddle.Generator().manual_seed(seed=0) text_encoder_config = CLIPTextConfig( @@ -68,10 +75,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.Generator().manual_seed(seed=0) adapter = T2IAdapter( block_out_channels=[32, 64], @@ -80,7 +87,8 @@ def get_dummy_components(self): kernel_size=1, res_block_skip=True, use_conv=False, - input_scale_factor=vae_scale_factor, ) + input_scale_factor=vae_scale_factor, + ) components = { "adapter": adapter, "unet": unet, @@ -114,30 +122,30 @@ def test_stable_diffusion_adapter_default_case(self): image = sd_pipe(**inputs).images image_slice = image[(0), -3:, -3:, (-1)] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.9088084, - 0.6012194, - 0.43046606, - 0.7228667, - 0.46428588, - 0.30164504, - 0.508494, - 0.6241546, - 0.55453974, - ]) + expected_slice = np.array( + [ + 0.9088084, + 0.6012194, + 0.43046606, + 0.7228667, + 0.46428588, + 0.30164504, + 0.508494, + 0.6241546, + 0.55453974, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass( - 
expected_max_diff=0.002) + return self._test_attention_slicing_forward_pass(expected_max_diff=0.002) @unittest.skipIf( not is_ppxformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=0.002) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=0.002) def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=0.002) @@ -153,16 +161,12 @@ def tearDown(self): def get_inputs(self, revision="segmentation", dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) image_urls = { - "segmentation": - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png", - "keypose": - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_input.png", - "depth": - "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_input.png", + "segmentation": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/segmentation/sample_input.png", + "keypose": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/keypose/sample_input.png", + "depth": "https://huggingface.co/RzZ/sd-v1-4-adapter-pipeline/resolve/depth/sample_input.png", } prompt_by_rev = { - "segmentation": - "A black Honda motorcycle parked in front of a garage", + "segmentation": "A black Honda motorcycle parked in front of a garage", "keypose": "An astronaut on the moon", "depth": "An office room with nice view", } @@ -180,9 +184,8 @@ def get_inputs(self, revision="segmentation", dtype="float32", seed=0): def test_stable_diffusion_segmentation_adapter(self): adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-seg") pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - safety_checker=None) + "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(revision="segmentation") @@ -196,9 +199,8 @@ def test_stable_diffusion_segmentation_adapter(self): def test_stable_diffusion_keypose_adapter(self): adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-keypose") pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - safety_checker=None) + "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(revision="keypose") @@ -212,9 +214,8 @@ def test_stable_diffusion_keypose_adapter(self): def test_stable_diffusion_depth_adapter(self): adapter = T2IAdapter.from_pretrained("RzZ/sd-v1-4-adapter-depth") pipe = StableDiffusionAdapterPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - adapter=adapter, - safety_checker=None) + "CompVis/stable-diffusion-v1-4", adapter=adapter, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(revision="depth") diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py index ddebfd6234a13..8b85c7bd484db 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ 
b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py @@ -19,9 +19,13 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, ControlNetModel, DDIMScheduler, - StableDiffusionControlNetPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + ControlNetModel, + DDIMScheduler, + StableDiffusionControlNetPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import load_image, load_numpy, randn_tensor, slow from ppdiffusers.utils.import_utils import is_ppxformers_available from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -30,8 +34,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionControlNetPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS @@ -46,7 +49,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) paddle.seed(0) controlnet = ControlNetModel( block_out_channels=(32, 64), @@ -54,14 +58,16 @@ def get_dummy_components(self): in_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), ) + conditioning_embedding_out_channels=(16, 32), + ) paddle.seed(0) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -69,7 +75,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -80,10 +87,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, @@ -106,8 +113,10 @@ def get_dummy_inputs(self, seed=0): 1, 3, 32 * controlnet_embedder_scale_factor, - 32 * controlnet_embedder_scale_factor, ), - generator=generator, ) + 32 * controlnet_embedder_scale_factor, + ), + generator=generator, + ) inputs = { "prompt": "A painting of a squirrel eating a burger", @@ -128,8 +137,7 @@ def test_attention_slicing_forward_pass(self): reason="XFormers attention is only available with CUDA and `xformers` installed", ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - expected_max_diff=1e-2) + self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-2) def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) @@ -144,13 +152,11 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_canny(self): - controlnet = ControlNetModel.from_pretrained( - 
"lllyasviel/sd-controlnet-canny") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -172,13 +178,11 @@ def test_canny(self): assert np.abs(expected_image - image).max() < 5e-3 def test_depth(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-depth") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -200,13 +204,11 @@ def test_depth(self): assert np.abs(expected_image - image).max() < 5e-3 def test_hed(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-hed") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -228,13 +230,11 @@ def test_hed(self): assert np.abs(expected_image - image).max() < 5e-3 def test_mlsd(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-mlsd") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -256,13 +256,11 @@ def test_mlsd(self): assert np.abs(expected_image - image).max() < 5e-3 def test_normal(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-normal") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -284,13 +282,11 @@ def test_normal(self): assert np.abs(expected_image - image).max() < 5e-3 def test_openpose(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -312,13 +308,11 @@ def test_openpose(self): assert np.abs(expected_image - image).max() < 5e-3 def test_scribble(self): - controlnet = 
ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-scribble") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(5) @@ -340,13 +334,11 @@ def test_scribble(self): assert np.abs(expected_image - image).max() < 5e-3 def test_seg(self): - controlnet = ControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-seg") + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - safety_checker=None, - controlnet=controlnet) + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(5) diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index 8739a78286b5f..a73cfcdbf1291 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -19,24 +19,28 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPImageProcessor, CLIPVisionConfig, - CLIPVisionModelWithProjection) +from paddlenlp.transformers import ( + CLIPImageProcessor, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) from PIL import Image -from ppdiffusers import (AutoencoderKL, DPMSolverMultistepScheduler, - PNDMScheduler, StableDiffusionImageVariationPipeline, - UNet2DConditionModel) -from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly, - slow) +from ppdiffusers import ( + AutoencoderKL, + DPMSolverMultistepScheduler, + PNDMScheduler, + StableDiffusionImageVariationPipeline, + UNet2DConditionModel, +) +from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (IMAGE_VARIATION_BATCH_PARAMS, - IMAGE_VARIATION_PARAMS) +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionImageVariationPipeline params = IMAGE_VARIATION_PARAMS batch_params = IMAGE_VARIATION_BATCH_PARAMS @@ -51,7 +55,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -60,7 +65,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) image_encoder_config = CLIPVisionConfig( hidden_size=32, @@ -70,7 +76,8 @@ def get_dummy_components(self): 
num_attention_heads=4, num_hidden_layers=5, image_size=32, - patch_size=4, ) + patch_size=4, + ) image_encoder = CLIPVisionModelWithProjection(image_encoder_config) feature_extractor = CLIPImageProcessor(crop_size=32, size=32) components = { @@ -106,17 +113,19 @@ def test_stable_diffusion_img_variation_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.22073305, - 0.22751817, - 0.32176197, - 0.26315716, - 0.25681925, - 0.41432184, - 0.2454437, - 0.10104704, - 0.32165903, - ]) + expected_slice = np.array( + [ + 0.22073305, + 0.22751817, + 0.32176197, + 0.26315716, + 0.25681925, + 0.41432184, + 0.2454437, + 0.10104704, + 0.32165903, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -130,17 +139,19 @@ def test_stable_diffusion_img_variation_multiple_images(self): image = output.images image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 64, 64, 3) - expected_slice = np.array([ - 0.61040395, - 0.7414253, - 0.5950623, - 0.5843509, - 0.25609648, - 0.28481025, - 0.61782926, - 0.3014974, - 0.35131538, - ]) + expected_slice = np.array( + [ + 0.61040395, + 0.7414253, + 0.5950623, + 0.5843509, + 0.25609648, + 0.28481025, + 0.61782926, + 0.3014974, + 0.35131538, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -154,9 +165,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png") latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) latents = paddle.to_tensor(latents).cast(dtype) inputs = { @@ -171,30 +180,32 @@ def get_inputs(self, dtype="float32", seed=0): def test_stable_diffusion_img_variation_pipeline_default(self): sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers", safety_checker=None) + "fusing/sd-image-variations-diffusers", safety_checker=None + ) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.5717014670372009, - 0.47024625539779663, - 0.47462183237075806, - 0.6388776898384094, - 0.5250844359397888, - 0.500831663608551, - 0.638043999671936, - 0.5769134163856506, - 0.5223015546798706, - ]) + expected_slice = np.array( + [ + 0.5717014670372009, + 0.47024625539779663, + 0.47462183237075806, + 0.6388776898384094, + 0.5250844359397888, + 0.500831663608551, + 0.638043999671936, + 0.5769134163856506, + 0.5223015546798706, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_img_variation_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -202,42 +213,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.1621, - 0.2837, - -0.7979, - -0.1221, - -1.3057, - 0.7681, - -2.1191, - 0.0464, - 1.6309, - ]) - 
assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.1621, + 0.2837, + -0.7979, + -0.1221, + -1.3057, + 0.7681, + -2.1191, + 0.0464, + 1.6309, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.6299, - 1.75, - 1.1992, - -2.1582, - -1.8994, - 0.7334, - -0.709, - 1.0137, - 1.5273, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.6299, + 1.75, + 1.1992, + -2.1582, + -1.8994, + 0.7334, + -0.709, + 1.0137, + 1.5273, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionImageVariationPipeline.from_pretrained( "fusing/sd-image-variations-diffusers", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -256,9 +270,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_image_vermeer.png") latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) latents = paddle.to_tensor(latents).cast(dtype) inputs = { @@ -272,28 +284,21 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_img_variation_pndm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers") + sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_pndm.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_pndm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img_variation_dpm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 25 image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_dpm_multi.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/lambdalabs_variations_dpm_multi.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 86f394a233323..101468b9a4534 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ 
b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -21,27 +21,30 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionImg2ImgPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImg2ImgPipeline, + UNet2DConditionModel, +) from ppdiffusers.image_processor import VaeImageProcessor -from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly, - slow) +from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS def get_dummy_components(self): @@ -54,7 +57,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -63,7 +67,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,10 +79,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -101,8 +106,7 @@ def get_dummy_inputs(self, seed=0, input_image_type="pd", output_type="np"): input_image = image.numpy().transpose(0, 2, 3, 1) input_image = VaeImageProcessor.numpy_to_pil(input_image) else: - raise ValueError( - f"unsupported input_image_type {input_image_type}.") + raise ValueError(f"unsupported input_image_type {input_image_type}.") if output_type not in ["pd", "np", "pil"]: raise ValueError(f"unsupported output_type {output_type}") @@ -125,17 +129,19 @@ def test_stable_diffusion_img2img_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.50082374, - 0.49329656, - 0.4963757, - 0.46307105, - 0.44599247, - 0.4877512, - 0.560709, - 0.56884044, - 
0.5738671, - ]) + expected_slice = np.array( + [ + 0.50082374, + 0.49329656, + 0.4963757, + 0.46307105, + 0.44599247, + 0.4877512, + 0.560709, + 0.56884044, + 0.5738671, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_img2img_negative_prompt(self): @@ -149,17 +155,19 @@ def test_stable_diffusion_img2img_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.48659712, - 0.4004616, - 0.4762491, - 0.49117112, - 0.5414775, - 0.58218545, - 0.5550886, - 0.52305603, - 0.61624044, - ]) + expected_slice = np.array( + [ + 0.48659712, + 0.4004616, + 0.4762491, + 0.49117112, + 0.5414775, + 0.58218545, + 0.5550886, + 0.52305603, + 0.61624044, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_img2img_multiple_init_images(self): @@ -173,40 +181,45 @@ def test_stable_diffusion_img2img_multiple_init_images(self): image = sd_pipe(**inputs).images image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([ - 0.49016288, - 0.23989454, - 0.4229045, - 0.56873804, - 0.467226, - 0.5793949, - 0.6967555, - 0.7027658, - 0.5809763, - ]) + expected_slice = np.array( + [ + 0.49016288, + 0.23989454, + 0.4229045, + 0.56873804, + 0.467226, + 0.5793949, + 0.6967555, + 0.7027658, + 0.5809763, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_img2img_k_lms(self): components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionImg2ImgPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.29999942, - 0.5206376, - 0.37915814, - 0.4033721, - 0.7630579, - 0.4642547, - 0.5823178, - 0.6936951, - 0.48969278, - ]) + expected_slice = np.array( + [ + 0.29999942, + 0.5206376, + 0.37915814, + 0.4033721, + 0.7630579, + 0.4642547, + 0.5823178, + 0.6936951, + 0.48969278, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_pt_np_pil_outputs_equivalent(self): @@ -218,10 +231,8 @@ def test_pt_np_pil_outputs_equivalent(self): output_np = sd_pipe(**self.get_dummy_inputs(output_type="np"))[0] output_pil = sd_pipe(**self.get_dummy_inputs(output_type="pil"))[0] - assert np.abs(output_pt.numpy().transpose(0, 2, 3, 1) - output_np).max( - ) <= 1e-4 - assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max( - ) <= 1e-4 + assert np.abs(output_pt.numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4 + assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4 def test_image_types_consistent(self): components = self.get_dummy_components() @@ -245,9 +256,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png") inputs = { "prompt": "a fantasy landscape, concept art, high resolution", "image": init_image, @@ -286,25 +295,26 @@ def get_inputs(self, 
dtype="float32", seed=0): # assert mean_diff < 5e-2 def test_stable_diffusion_img2img_default(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([ - 0.27150, - 0.14849, - 0.15605, - 0.26740, - 0.16954, - 0.18204, - 0.31470, - 0.26311, - 0.24525, - ]) + expected_slice = np.array( + [ + 0.27150, + 0.14849, + 0.15605, + 0.26740, + 0.16954, + 0.18204, + 0.31470, + 0.26311, + 0.24525, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 # def test_img2img_safety_checker_works(self): @@ -322,8 +332,7 @@ def test_stable_diffusion_img2img_default(self): # assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros def test_stable_diffusion_img2img_k_lms(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -331,22 +340,23 @@ def test_stable_diffusion_img2img_k_lms(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([ - 0.04890, - 0.04862, - 0.06422, - 0.04655, - 0.05108, - 0.05307, - 0.05926, - 0.08759, - 0.06852, - ]) + expected_slice = np.array( + [ + 0.04890, + 0.04862, + 0.06422, + 0.04655, + 0.05108, + 0.05307, + 0.05926, + 0.08759, + 0.06852, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_img2img_ddim(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -354,24 +364,25 @@ def test_stable_diffusion_img2img_ddim(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([ - 0.06069, - 0.05703, - 0.08054, - 0.05797, - 0.06286, - 0.06234, - 0.08438, - 0.11151, - 0.08068, - ]) + expected_slice = np.array( + [ + 0.06069, + 0.05703, + 0.08054, + 0.05797, + 0.06286, + 0.06234, + 0.08438, + 0.11151, + 0.08068, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_img2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -379,42 +390,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7650054097175598, - 0.10256098955869675, - 0.4976114332675934, - 3.388350009918213, - 
3.7242040634155273, - 4.272988796234131, - 2.4656283855438232, - 3.483647108078003, - 1.765011191368103, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7650054097175598, + 0.10256098955869675, + 0.4976114332675934, + 3.388350009918213, + 3.7242040634155273, + 4.272988796234131, + 2.4656283855438232, + 3.483647108078003, + 1.765011191368103, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7580092549324036, - 0.10288780182600021, - 0.4941849708557129, - 3.3663346767425537, - 3.7071609497070312, - 4.25173807144165, - 2.4461638927459717, - 3.451681137084961, - 1.761878490447998, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7580092549324036, + 0.10288780182600021, + 0.4941849708557129, + 3.3663346767425537, + 3.7071609497070312, + 4.25173807144165, + 2.4461638927459717, + 3.451681137084961, + 1.761878490447998, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionImg2ImgPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -423,13 +437,10 @@ def callback_fn(step: int, timestep: int, assert number_of_steps == 2 def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.jpg" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.jpg") init_image = init_image.resize((760, 504)) model_id = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "A fantasy landscape, trending on artstation" @@ -440,21 +451,24 @@ def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): strength=0.75, guidance_scale=7.5, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] image_slice = image[255:258, 383:386, -1] assert image.shape == (504, 760, 3) - expected_slice = np.array([ - 0.71240354, - 0.71053374, - 0.69922864, - 0.7139934, - 0.7106118, - 0.69451976, - 0.71982634, - 0.71717453, - 0.70306426, - ]) + expected_slice = np.array( + [ + 0.71240354, + 0.71053374, + 0.69922864, + 0.7139934, + 0.7106118, + 0.69451976, + 0.71982634, + 0.71717453, + 0.70306426, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 @@ -468,9 +482,7 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png" - ) + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/sketch-mountains-input.png") inputs = { "prompt": "a fantasy landscape, concept art, high resolution", "image": init_image, @@ -483,59 +495,45 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def 
test_img2img_pndm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_pndm.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_pndm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img2img_ddim(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_ddim.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_ddim.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img2img_lms(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_lms.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_lms.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_img2img_dpm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 30 image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_dpm.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_1_5_dpm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 70688fa0182a1..0a815f465532b 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -22,22 +22,28 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DPMSolverMultistepScheduler, - LMSDiscreteScheduler, 
PNDMScheduler, - StableDiffusionInpaintPipeline, UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import \ - prepare_mask_and_masked_image -from ppdiffusers.utils import (floats_tensor, load_image, load_numpy, nightly, - slow) +from ppdiffusers import ( + AutoencoderKL, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionInpaintPipeline, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import ( + prepare_mask_and_masked_image, +) +from ppdiffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS @@ -52,7 +58,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -61,7 +68,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -72,10 +80,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -90,11 +98,8 @@ def get_dummy_components(self): def get_dummy_inputs(self, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (64, 64))) + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) generator = paddle.Generator().manual_seed(seed) inputs = { @@ -116,17 +121,19 @@ def test_stable_diffusion_inpaint(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.55786943, - 0.628228, - 0.49147403, - 0.3191774, - 0.39249492, - 0.46521175, - 0.29909956, - 0.21160087, - 0.42932406, - ]) + expected_slice = np.array( + [ + 0.55786943, + 0.628228, + 0.49147403, + 0.3191774, + 0.39249492, + 0.46521175, + 0.29909956, + 0.21160087, + 0.42932406, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() 
< 0.01 def test_stable_diffusion_inpaint_image_tensor(self): @@ -138,11 +145,11 @@ def test_stable_diffusion_inpaint_image_tensor(self): out_pil = output.images inputs = self.get_dummy_inputs() inputs["image"] = ( - paddle.to_tensor(np.array(inputs["image"]) / 127.5 - 1) - .transpose(perm=[2, 0, 1]).unsqueeze(axis=0)) + paddle.to_tensor(np.array(inputs["image"]) / 127.5 - 1).transpose(perm=[2, 0, 1]).unsqueeze(axis=0) + ) inputs["mask_image"] = ( - paddle.to_tensor(np.array(inputs["mask_image"]) / 255) - .transpose(perm=[2, 0, 1])[:1].unsqueeze(axis=0)) + paddle.to_tensor(np.array(inputs["mask_image"]) / 255).transpose(perm=[2, 0, 1])[:1].unsqueeze(axis=0) + ) output = sd_pipe(**inputs) out_tensor = output.images assert out_pil.shape == (1, 64, 64, 3) @@ -166,13 +173,10 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") + mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") inputs = { - "prompt": - "Face of a yellow cat, high resolution, sitting on a park bench", + "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", "image": init_image, "mask_image": mask_image, "generator": generator, @@ -184,53 +188,60 @@ def get_inputs(self, dtype="float32", seed=0): def test_stable_diffusion_inpaint_ddim(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None) + "runwayml/stable-diffusion-inpainting", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.05978, - 0.10983, - 0.10514, - 0.07922, - 0.08483, - 0.08587, - 0.05302, - 0.03218, - 0.01636, - ]) + expected_slice = np.array( + [ + 0.05978, + 0.10983, + 0.10514, + 0.07922, + 0.08483, + 0.08587, + 0.05302, + 0.03218, + 0.01636, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_fp16(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( "runwayml/stable-diffusion-inpainting", paddle_dtype=paddle.float16, - safety_checker=None, ) + safety_checker=None, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.9921875, - 0.9477539, - 0.90234375, - 0.96484375, - 0.9189453, - 0.875, - 0.9316406, - 0.9013672, - 0.875, - ]) + expected_slice = np.array( + [ + 0.9921875, + 0.9477539, + 0.90234375, + 0.96484375, + 0.9189453, + 0.875, + 0.9316406, + 0.9013672, + 0.875, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.05 def test_stable_diffusion_inpaint_pndm(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None) + "runwayml/stable-diffusion-inpainting", safety_checker=None + ) pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) 
pipe.enable_attention_slicing() @@ -238,22 +249,25 @@ def test_stable_diffusion_inpaint_pndm(self): image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.06892, - 0.06994, - 0.07905, - 0.05366, - 0.04709, - 0.04890, - 0.04107, - 0.05083, - 0.04180, - ]) + expected_slice = np.array( + [ + 0.06892, + 0.06994, + 0.07905, + 0.05366, + 0.04709, + 0.04890, + 0.04107, + 0.05083, + 0.04180, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_k_lms(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None) + "runwayml/stable-diffusion-inpainting", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -261,17 +275,19 @@ def test_stable_diffusion_inpaint_k_lms(self): image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.23513, - 0.22413, - 0.29442, - 0.24243, - 0.26214, - 0.30329, - 0.26431, - 0.25025, - 0.25197, - ]) + expected_slice = np.array( + [ + 0.23513, + 0.22413, + 0.29442, + 0.24243, + 0.26214, + 0.30329, + 0.26431, + 0.25025, + 0.25197, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 @@ -285,13 +301,10 @@ def tearDown(self): def get_inputs(self, dtype="float32", seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") + mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") inputs = { - "prompt": - "Face of a yellow cat, high resolution, sitting on a park bench", + "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", "image": init_image, "mask_image": mask_image, "generator": generator, @@ -302,52 +315,40 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_ddim.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_ddim.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") sd_pipe.scheduler = PNDMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_pndm.npy" - ) + expected_image = 
load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_pndm.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = load_numpy( - "https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_lms.npy" - ) + expected_image = load_numpy("https://paddlenlp.bj.bcebos.com/data/images/stable_diffusion_inpaint_lms.npy") max_diff = np.abs(expected_image - image).max() assert max_diff < 0.001 def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -360,8 +361,7 @@ def test_inpaint_dpm(self): assert max_diff < 0.001 -class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests( - unittest.TestCase): +class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase): def test_pil_inputs(self): im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8) im = Image.fromarray(im) @@ -389,8 +389,7 @@ def test_np_inputs(self): mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5 mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8)) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, - mask_pil) + t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil) self.assertTrue((t_mask_np == t_mask_pil).all()) self.assertTrue((t_masked_np == t_masked_pil).all()) @@ -401,7 +400,8 @@ def test_paddle_3D_2D_inputs(self): mask_np = mask_tensor.numpy() t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) @@ -413,7 +413,8 @@ def test_paddle_3D_3D_inputs(self): im_np = im_tensor.numpy().transpose(1, 2, 0) mask_np = mask_tensor.numpy()[0] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) @@ -424,7 +425,8 @@ def test_paddle_4D_2D_inputs(self): im_np = im_tensor.numpy()[0].transpose(1, 2, 0) mask_np = mask_tensor.numpy() t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, 
mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) @@ -435,19 +437,20 @@ def test_paddle_4D_3D_inputs(self): im_np = im_tensor.numpy()[0].transpose(1, 2, 0) mask_np = mask_tensor.numpy()[0] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) def test_paddle_4D_4D_inputs(self): im_tensor = paddle.randint(0, 255, (1, 3, 32, 32)).cast("uint8") - mask_tensor = paddle.randint(0, 255, - (1, 1, 32, 32)).cast("uint8") > 127.5 + mask_tensor = paddle.randint(0, 255, (1, 1, 32, 32)).cast("uint8") > 127.5 im_np = im_tensor.numpy()[0].transpose(1, 2, 0) mask_np = mask_tensor.numpy()[0][0] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) self.assertTrue((t_mask_tensor == t_mask_np.cast("float64")).all()) self.assertTrue((t_masked_tensor == t_masked_np).all()) @@ -458,11 +461,9 @@ def test_paddle_batch_4D_3D(self): im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] mask_nps = [mask.numpy() for mask in mask_tensor] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) - nps = [ - prepare_mask_and_masked_image(i, m) - for i, m in zip(im_nps, mask_nps) - ] + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) + nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] t_mask_np = paddle.concat(x=[n[0] for n in nps]) t_masked_np = paddle.concat(x=[n[1] for n in nps]) self.assertTrue((t_mask_tensor == t_mask_np).all()) @@ -475,11 +476,9 @@ def test_paddle_batch_4D_4D(self): im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] mask_nps = [mask.numpy() for mask in mask_tensor] t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image( - im_tensor / 127.5 - 1, mask_tensor.cast("int64")) - nps = [ - prepare_mask_and_masked_image(i, m) - for i, m in zip(im_nps, mask_nps) - ] + im_tensor / 127.5 - 1, mask_tensor.cast("int64") + ) + nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] t_mask_np = paddle.concat(x=[n[0] for n in nps]) t_masked_np = paddle.concat(x=[n[1] for n in nps]) self.assertTrue((t_mask_tensor == t_mask_np).all()) @@ -487,44 +486,28 @@ def test_paddle_batch_4D_4D(self): def test_shape_mismatch(self): with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.randn(shape=[3, 32, 32]), paddle.randn(shape=[64, 64])) + prepare_mask_and_masked_image(paddle.randn(shape=[3, 32, 32]), paddle.randn(shape=[64, 64])) with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.randn(shape=[2, 3, 32, 32]), - paddle.randn(shape=[4, 64, 64])) + prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 64, 64])) with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.randn(shape=[2, 3, 32, 32]), - paddle.randn(shape=[4, 1, 64, 64])) + prepare_mask_and_masked_image(paddle.randn(shape=[2, 3, 32, 32]), paddle.randn(shape=[4, 1, 64, 64])) def test_type_mismatch(self): with self.assertRaises(TypeError): - prepare_mask_and_masked_image( - 
paddle.rand(shape=[3, 32, 32]), - paddle.rand(shape=[3, 32, 32]).numpy()) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.rand(shape=[3, 32, 32]).numpy()) with self.assertRaises(TypeError): - prepare_mask_and_masked_image( - paddle.rand(shape=[3, 32, 32]).numpy(), - paddle.rand(shape=[3, 32, 32])) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]).numpy(), paddle.rand(shape=[3, 32, 32])) def test_channels_first(self): with self.assertRaises(AssertionError): - prepare_mask_and_masked_image( - paddle.rand(shape=[32, 32, 3]), paddle.rand(shape=[3, 32, 32])) + prepare_mask_and_masked_image(paddle.rand(shape=[32, 32, 3]), paddle.rand(shape=[3, 32, 32])) def test_tensor_range(self): with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.ones(shape=[3, 32, 32]) * 2, paddle.rand(shape=[32, 32])) + prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * 2, paddle.rand(shape=[32, 32])) with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.ones(shape=[3, 32, 32]) * -2, - paddle.rand(shape=[32, 32])) + prepare_mask_and_masked_image(paddle.ones(shape=[3, 32, 32]) * -2, paddle.rand(shape=[32, 32])) with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * 2) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * 2) with self.assertRaises(ValueError): - prepare_mask_and_masked_image( - paddle.rand(shape=[3, 32, 32]), - paddle.ones(shape=[32, 32]) * -1) + prepare_mask_and_masked_image(paddle.rand(shape=[3, 32, 32]), paddle.ones(shape=[32, 32]) * -1) diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py index 6866f1a367654..aef0082255467 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py @@ -22,13 +22,23 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionInpaintPipelineLegacy, - UNet2DConditionModel, UNet2DModel, VQModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionInpaintPipelineLegacy, + UNet2DConditionModel, + UNet2DModel, + VQModel, +) from ppdiffusers.utils import floats_tensor, load_image, nightly, slow -from ppdiffusers.utils.testing_utils import (load_numpy, preprocess_image, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import ( + load_numpy, + preprocess_image, + require_paddle_gpu, +) class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase): @@ -42,8 +52,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -56,7 +65,8 @@ def dummy_uncond_unet(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model @property @@ -70,7 +80,8 @@ def 
dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -84,7 +95,8 @@ def dummy_cond_unet_inpaint(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -96,7 +108,8 @@ def dummy_vq_model(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, ) + latent_channels=3, + ) return model @property @@ -108,7 +121,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -123,7 +137,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -146,13 +161,10 @@ def test_stable_diffusion_inpaint_legacy(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) sd_pipe = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -160,7 +172,8 @@ def test_stable_diffusion_inpaint_legacy(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -171,7 +184,8 @@ def test_stable_diffusion_inpaint_legacy(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ) + mask_image=mask_image, + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -182,32 +196,33 @@ def test_stable_diffusion_inpaint_legacy(self): output_type="np", image=init_image, mask_image=mask_image, - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.01514593, - 0.46352747, - 0.34991893, - 0.29177475, - 0.5415823, - 0.56992227, - 0.39533705, - 0.67953515, - 0.5445507, - ]) + expected_slice = np.array( + [ + 0.01514593, + 0.46352747, + 0.34991893, + 0.29177475, + 0.5415823, + 0.56992227, + 0.39533705, + 0.67953515, + 0.5445507, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_inpaint_legacy_batched(self): unet = self.dummy_cond_unet scheduler = PNDMScheduler(skip_prk_steps=True) 
vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.permute(0, 2, 3, 1)[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") @@ -222,7 +237,8 @@ def test_stable_diffusion_inpaint_legacy_batched(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" @@ -234,35 +250,40 @@ def test_stable_diffusion_inpaint_legacy_batched(self): num_inference_steps=2, output_type="np", image=init_images_tens, - mask_image=init_masks_tens, ).images + mask_image=init_masks_tens, + ).images assert images.shape == (2, 32, 32, 3) image_slice_0 = images[0, -3:, -3:, -1].flatten() image_slice_1 = images[1, -3:, -3:, -1].flatten() - expected_slice_0 = np.array([ - 0.50299895, - 0.6465979, - 0.3489662, - 0.28862774, - 0.59657216, - 0.41669005, - 0.19621253, - 0.27549136, - 0.39040852, - ]) - expected_slice_1 = np.array([ - 0.70079666, - 0.5616544, - 0.5304112, - 0.38820785, - 0.3118701, - 0.47477302, - 0.37215403, - 0.3785481, - 0.50153226, - ]) + expected_slice_0 = np.array( + [ + 0.50299895, + 0.6465979, + 0.3489662, + 0.28862774, + 0.59657216, + 0.41669005, + 0.19621253, + 0.27549136, + 0.39040852, + ] + ) + expected_slice_1 = np.array( + [ + 0.70079666, + 0.5616544, + 0.5304112, + 0.38820785, + 0.3118701, + 0.47477302, + 0.37215403, + 0.3785481, + 0.50153226, + ] + ) assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-2 assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-2 @@ -272,13 +293,10 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) sd_pipe = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -286,7 +304,8 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" negative_prompt = "french fries" @@ -299,21 +318,24 @@ def test_stable_diffusion_inpaint_legacy_negative_prompt(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ) + mask_image=mask_image, + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.0, - 0.43941003, - 0.32130337, - 0.31442684, - 0.566114, - 0.56392324, - 0.3946159, - 0.6844422, - 0.5345681, - ]) + expected_slice = np.array( + [ + 0.0, + 0.43941003, + 0.32130337, + 0.31442684, + 0.566114, + 0.56392324, + 0.3946159, + 0.6844422, + 0.5345681, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() 
< 0.01 def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): @@ -321,13 +343,10 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) sd_pipe = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -335,7 +354,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" images = sd_pipe( @@ -343,7 +363,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ).images + mask_image=mask_image, + ).images assert images.shape == (1, 32, 32, 3) batch_size = 2 images = sd_pipe( @@ -351,7 +372,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ).images + mask_image=mask_image, + ).images assert images.shape == (batch_size, 32, 32, 3) num_images_per_prompt = 2 images = sd_pipe( @@ -360,7 +382,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): output_type="np", image=init_image, mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (num_images_per_prompt, 32, 32, 3) batch_size = 2 images = sd_pipe( @@ -369,7 +392,8 @@ def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): output_type="np", image=init_image, mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, ).images + num_images_per_prompt=num_images_per_prompt, + ).images assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) @@ -383,10 +407,8 @@ def tearDown(self): def get_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed) - init_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") - mask_image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") + init_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_image.png") + mask_image = load_image("https://paddlenlp.bj.bcebos.com/data/images/input_bench_mask.png") inputs = { "prompt": "A red cat sitting on a park bench", "image": init_image, @@ -401,29 +423,33 @@ def get_inputs(self, seed=0): def test_stable_diffusion_inpaint_legacy_pndm(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) 
- expected_slice = np.array([ - 0.27226633, - 0.29068208, - 0.3450312, - 0.21444553, - 0.26328486, - 0.34392387, - 0.18026042, - 0.24961185, - 0.3214044, - ]) + expected_slice = np.array( + [ + 0.27226633, + 0.29068208, + 0.3450312, + 0.21444553, + 0.26328486, + 0.34392387, + 0.18026042, + 0.24961185, + 0.3214044, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_legacy_batched(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -443,35 +469,40 @@ def test_stable_diffusion_inpaint_legacy_batched(self): image_slice_0 = image[0, 253:256, 253:256, -1].flatten() image_slice_1 = image[1, 253:256, 253:256, -1].flatten() - expected_slice_0 = np.array([ - 0.27526367, - 0.29158682, - 0.35184938, - 0.21504477, - 0.26708275, - 0.35169, - 0.18185198, - 0.2572803, - 0.32425082, - ]) - expected_slice_1 = np.array([ - 0.0, - 0.18929192, - 0.7068148, - 0.07977328, - 0.13444492, - 0.5016247, - 0.49761847, - 0.2830933, - 0.36412603, - ]) + expected_slice_0 = np.array( + [ + 0.27526367, + 0.29158682, + 0.35184938, + 0.21504477, + 0.26708275, + 0.35169, + 0.18185198, + 0.2572803, + 0.32425082, + ] + ) + expected_slice_1 = np.array( + [ + 0.0, + 0.18929192, + 0.7068148, + 0.07977328, + 0.13444492, + 0.5016247, + 0.49761847, + 0.2830933, + 0.36412603, + ] + ) assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-4 assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-4 def test_stable_diffusion_inpaint_legacy_k_lms(self): pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None) + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -479,24 +510,25 @@ def test_stable_diffusion_inpaint_legacy_k_lms(self): image = pipe(**inputs).images image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.29036117, - 0.28907132, - 0.32839334, - 0.26510137, - 0.2820784, - 0.31148806, - 0.29358387, - 0.29515788, - 0.28257304, - ]) + expected_slice = np.array( + [ + 0.29036117, + 0.28907132, + 0.32839334, + 0.26510137, + 0.2820784, + 0.31148806, + 0.29358387, + 0.29515788, + 0.28257304, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.0001 def test_stable_diffusion_inpaint_legacy_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -504,42 +536,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.103, - 1.415, - -0.02197, - -0.5103, - -0.5903, - 0.1953, - 0.75, - 0.3477, - -1.356, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.001 + expected_slice = np.array( + [ + -0.103, + 1.415, + -0.02197, + -0.5103, + -0.5903, + 0.1953, + 0.75, + 0.3477, + -1.356, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 elif step == 2: latents = 
latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.4802, - 1.154, - 0.628, - 0.2322, - 0.2593, - -0.1455, - 0.7075, - -0.1617, - -0.5615, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.001 + expected_slice = np.array( + [ + 0.4802, + 1.154, + 0.628, + 0.2322, + 0.2593, + -0.1455, + 0.7075, + -0.1617, + -0.5615, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.001 callback_fn.has_been_called = False pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() @@ -577,20 +612,17 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.7330009, 0.80003107, 0.8268216], - [0.73606366, 0.801595, 0.8470554]]) + expected_image = np.array([[0.7330009, 0.80003107, 0.8268216], [0.73606366, 0.801595, 0.8470554]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) @@ -599,36 +631,29 @@ def test_inpaint_ddim(self): expected_image = load_numpy( "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_ddim.npy" ) - expected_image = np.array([[0.7290994, 0.794852, 0.82096446], - [0.7330909, 0.79727536, 0.8420528]]) + expected_image = np.array([[0.7290994, 0.794852, 0.82096446], [0.7330909, 0.79727536, 0.8420528]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.74595624, 0.81757987, 0.84589916], - [0.74728143, 0.81736475, 0.86543]]) + expected_image = np.array([[0.74595624, 0.81757987, 0.84589916], [0.74728143, 0.81736475, 0.86543]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") + sd_pipe.scheduler = 
DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 30 image = sd_pipe(**inputs).images[0] - expected_image = np.array([[0.7310472, 0.7970823, 0.8231524], - [0.7348697, 0.799358, 0.8439586]]) + expected_image = np.array([[0.7310472, 0.7970823, 0.8231524], [0.7348697, 0.799358, 0.8439586]]) max_diff = np.abs(expected_image - image[0][0:2]).max() assert max_diff < 0.001 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index c367a6f472e51..0a6d49df4418f 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -22,20 +22,26 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionInstructPix2PixPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionInstructPix2PixPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionInstructPix2PixPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { "height", @@ -54,7 +60,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -63,7 +70,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -74,10 +82,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -114,17 +122,19 @@ def test_stable_diffusion_pix2pix_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.24897021, - 0.3813318, - 0.15630311, - 0.69198483, - 0.7409521, - 0.55128014, - 0.5978868, - 0.60921687, - 
0.47007012, - ]) + expected_slice = np.array( + [ + 0.24897021, + 0.3813318, + 0.15630311, + 0.69198483, + 0.7409521, + 0.55128014, + 0.5978868, + 0.60921687, + 0.47007012, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_pix2pix_negative_prompt(self): @@ -137,17 +147,19 @@ def test_stable_diffusion_pix2pix_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.27121854, - 0.34936333, - 0.12865198, - 0.77894104, - 0.81688535, - 0.6136005, - 0.62261313, - 0.6386795, - 0.5096967, - ]) + expected_slice = np.array( + [ + 0.27121854, + 0.34936333, + 0.12865198, + 0.77894104, + 0.81688535, + 0.6136005, + 0.62261313, + 0.6386795, + 0.5096967, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_pix2pix_multiple_init_images(self): @@ -164,23 +176,26 @@ def test_stable_diffusion_pix2pix_multiple_init_images(self): image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([ - 0.41508308, - 0.41580454, - 0.5588631, - 0.32340443, - 0.20930073, - 0.35993075, - 0.28470254, - 0.38203996, - 0.51769114, - ]) + expected_slice = np.array( + [ + 0.41508308, + 0.41580454, + 0.5588631, + 0.32340443, + 0.20930073, + 0.35993075, + 0.28470254, + 0.38203996, + 0.51769114, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_pix2pix_euler(self): components = self.get_dummy_components() components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() @@ -189,17 +204,19 @@ def test_stable_diffusion_pix2pix_euler(self): slice = [round(x, 4) for x in image_slice.flatten().tolist()] print(",".join([str(x) for x in slice])) assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.26694882, - 0.4288544, - 0.21950376, - 0.74369204, - 0.6756442, - 0.54577595, - 0.5941435, - 0.5603916, - 0.51743454, - ]) + expected_slice = np.array( + [ + 0.26694882, + 0.4288544, + 0.21950376, + 0.74369204, + 0.6756442, + 0.54577595, + 0.5941435, + 0.5603916, + 0.51743454, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -213,8 +230,7 @@ def tearDown(self): def get_inputs(self, seed=0): generator = paddle.Generator().manual_seed(seed=seed) - image = load_image( - "https://paddlenlp.bj.bcebos.com/data/images/example.jpg") + image = load_image("https://paddlenlp.bj.bcebos.com/data/images/example.jpg") inputs = { "prompt": "turn him into a cyborg", "image": image, @@ -228,29 +244,33 @@ def get_inputs(self, seed=0): def test_stable_diffusion_pix2pix_default(self): pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None) + "timbrooks/instruct-pix2pix", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.32138163, - 0.32519442, - 0.33127248, - 0.32613453, - 0.33317798, - 0.33505, - 0.32397628, - 0.32964426, - 0.32055843, - ]) + expected_slice = np.array( + [ + 
0.32138163, + 0.32519442, + 0.33127248, + 0.32613453, + 0.33317798, + 0.33505, + 0.32397628, + 0.32964426, + 0.32055843, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_pix2pix_k_lms(self): pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None) + "timbrooks/instruct-pix2pix", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -258,22 +278,25 @@ def test_stable_diffusion_pix2pix_k_lms(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.38934484, - 0.3929934, - 0.39973113, - 0.4196028, - 0.42386433, - 0.43073824, - 0.4267708, - 0.43173674, - 0.41896266, - ]) + expected_slice = np.array( + [ + 0.38934484, + 0.3929934, + 0.39973113, + 0.4196028, + 0.42386433, + 0.43073824, + 0.4267708, + 0.43173674, + 0.41896266, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_pix2pix_ddim(self): pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None) + "timbrooks/instruct-pix2pix", safety_checker=None + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -281,24 +304,25 @@ def test_stable_diffusion_pix2pix_ddim(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.51511174, - 0.5185677, - 0.51326, - 0.5176025, - 0.514665, - 0.519833, - 0.52196854, - 0.5121842, - 0.52435803, - ]) + expected_slice = np.array( + [ + 0.51511174, + 0.5185677, + 0.51326, + 0.5176025, + 0.514665, + 0.519833, + 0.52196854, + 0.5121842, + 0.52435803, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.001 def test_stable_diffusion_pix2pix_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -306,28 +330,21 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.7104, -0.8994, -1.387, 1.825, 1.964, 1.377, 1.158, 1.556, - 1.227 - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array([-0.7104, -0.8994, -1.387, 1.825, 1.964, 1.377, 1.158, 1.556, 1.227]) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.7124, -0.9087, -1.384, 1.826, 1.992, 1.368, 1.16, 1.537, - 1.239 - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array([-0.7124, -0.9087, -1.384, 1.826, 1.992, 1.368, 1.16, 1.537, 1.239]) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( "timbrooks/instruct-pix2pix", safety_checker=None, - paddle_dtype=paddle.float16, ) + 
paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() @@ -339,23 +356,24 @@ def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self): inputs = self.get_inputs() inputs["image"] = inputs["image"].resize((504, 504)) model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() output = pipe(**inputs) image = output.images[0] image_slice = image[255:258, 383:386, -1] assert image.shape == (504, 504, 3) - expected_slice = np.array([ - 0.183373, - 0.20458564, - 0.2428664, - 0.18245864, - 0.22010538, - 0.25757712, - 0.19680199, - 0.2185145, - 0.24869373, - ]) + expected_slice = np.array( + [ + 0.183373, + 0.20458564, + 0.2428664, + 0.18245864, + 0.22010538, + 0.25757712, + 0.19680199, + 0.2185145, + 0.24869373, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.005 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 176a0629de209..9f4ef2ff6f041 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -20,10 +20,15 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionPanoramaPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPanoramaPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -31,8 +36,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionPanoramaPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS @@ -47,7 +51,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler() paddle.seed(0) vae = AutoencoderKL( @@ -56,7 +61,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -67,10 +73,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -103,17 +109,19 @@ def 
test_stable_diffusion_panorama_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28862977, - 0.2441951, - 0.2683525, - 0.33122095, - 0.28755113, - 0.46375293, - 0.254181, - 0.30616608, - 0.4785265, - ]) + expected_slice = np.array( + [ + 0.28862977, + 0.2441951, + 0.2683525, + 0.33122095, + 0.28755113, + 0.46375293, + 0.254181, + 0.30616608, + 0.4785265, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 # override to speed the overall test timing up. @@ -134,40 +142,45 @@ def test_stable_diffusion_panorama_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28995812, - 0.24463832, - 0.2682391, - 0.33033937, - 0.2868188, - 0.46267676, - 0.25425047, - 0.3066897, - 0.47881347, - ]) + expected_slice = np.array( + [ + 0.28995812, + 0.24463832, + 0.2682391, + 0.33033937, + 0.2868188, + 0.46267676, + 0.25425047, + 0.3066897, + 0.47881347, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_panorama_euler(self): components = self.get_dummy_components() components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionPanoramaPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.32409406, - 0.2660764, - 0.41739762, - 0.18994612, - 0.32522476, - 0.4869789, - 0.13573006, - 0.14128971, - 0.32650158, - ]) + expected_slice = np.array( + [ + 0.32409406, + 0.2660764, + 0.41739762, + 0.18994612, + 0.32522476, + 0.4869789, + 0.13573006, + 0.14128971, + 0.32650158, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_panorama_pndm(self): @@ -201,32 +214,33 @@ def get_inputs(self, seed=0): def test_stable_diffusion_panorama_default(self): model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained( - model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained( - model_ckpt, scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") + pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 2048, 3) - expected_slice = np.array([ - 0.34261876, - 0.3045774, - 0.34545267, - 0.33774284, - 0.3431282, - 0.33453488, - 0.3094663, - 0.32646674, - 0.32534528, - ]) + expected_slice = np.array( + [ + 0.34261876, + 0.3045774, + 0.34545267, + 0.33774284, + 0.3431282, + 0.33453488, + 0.3094663, + 0.32646674, + 0.32534528, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.01 def test_stable_diffusion_panorama_k_lms(self): pipe = StableDiffusionPanoramaPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", safety_checker=None) + "stabilityai/stable-diffusion-2-base", safety_checker=None + ) pipe.scheduler = 
LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -234,24 +248,25 @@ def test_stable_diffusion_panorama_k_lms(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 2048, 3) - expected_slice = np.array([ - 0.0, - 0.01188838, - 0.02675471, - 0.00534895, - 0.02325496, - 0.01234779, - 0.0348064, - 0.0, - 0.02607787, - ]) + expected_slice = np.array( + [ + 0.0, + 0.01188838, + 0.02675471, + 0.00534895, + 0.02325496, + 0.01234779, + 0.0348064, + 0.0, + 0.02607787, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.01 def test_stable_diffusion_panorama_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -259,43 +274,43 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 256) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7392851114273071, - -0.16683124005794525, - 0.2063215672969818, - -0.09840865433216095, - 0.18722617626190186, - -0.08375956118106842, - 0.06995373964309692, - -0.20892930030822754, - -0.157355397939682, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7392851114273071, + -0.16683124005794525, + 0.2063215672969818, + -0.09840865433216095, + 0.18722617626190186, + -0.08375956118106842, + 0.06995373964309692, + -0.20892930030822754, + -0.157355397939682, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 256) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.7368452548980713, - -0.16317462921142578, - 0.20289096236228943, - -0.10271137207746506, - 0.1873130351305008, - -0.08454630523920059, - 0.06944799423217773, - -0.20782311260700226, - -0.15696658194065094, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.7368452548980713, + -0.16317462921142578, + 0.20289096236228943, + -0.10271137207746506, + 0.1873130351305008, + -0.08454630523920059, + 0.06944799423217773, + -0.20782311260700226, + -0.15696658194065094, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained( - model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained( - model_ckpt, scheduler=scheduler, safety_checker=None) + scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") + pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 0bee318686efc..d4787ab8eaa4d 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ 
b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -21,14 +21,22 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMInverseScheduler, DDIMScheduler, DDPMScheduler, - EulerAncestralDiscreteScheduler, LMSDiscreteScheduler, - StableDiffusionPix2PixZeroPipeline, UNet2DConditionModel) + AutoencoderKL, + DDIMInverseScheduler, + DDIMScheduler, + DDPMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + StableDiffusionPix2PixZeroPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import load_image, slow from ppdiffusers.utils.testing_utils import load_pt, require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin @@ -39,8 +47,7 @@ def to_paddle(x): # we use SGD optimizer in this pipeline, so the result is not stable! -class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionPix2PixZeroPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS @@ -51,12 +58,14 @@ def setUpClass(cls): cls.source_embeds = to_paddle( load_pt( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt" - )) + ) + ) cls.target_embeds = to_paddle( load_pt( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt" - )) + ) + ) def get_dummy_components(self): paddle.seed(0) @@ -68,7 +77,8 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler() paddle.seed(0) vae = AutoencoderKL( @@ -77,7 +87,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -88,10 +99,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -128,17 +139,19 @@ def test_stable_diffusion_pix2pix_zero_default_case(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.58762765, - 0.17410329, - 0.5067884, - 0.39995563, - 0.02808204, - 0.35726422, - 0.3250693, - 0.3155224, - 0.5268162, - ]) + expected_slice = np.array( + [ + 0.58762765, + 0.17410329, + 0.5067884, + 0.39995563, + 0.02808204, + 0.35726422, + 0.3250693, + 0.3155224, + 0.5268162, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_negative_prompt(self): @@ -151,40 +164,45 @@ def test_stable_diffusion_pix2pix_zero_negative_prompt(self): 
image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.5042143, - 0.34658563, - 0.56157184, - 0.3707891, - 0.23746812, - 0.47898933, - 0.2702424, - 0.36307925, - 0.50807047, - ]) + expected_slice = np.array( + [ + 0.5042143, + 0.34658563, + 0.56157184, + 0.3707891, + 0.23746812, + 0.47898933, + 0.2702424, + 0.36307925, + 0.50807047, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_euler(self): components = self.get_dummy_components() components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear") + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.4870367, - 0.2677226, - 0.37830275, - 0.63265973, - 0.32151344, - 0.406371, - 0.67513967, - 0.5246535, - 0.55954224, - ]) + expected_slice = np.array( + [ + 0.4870367, + 0.2677226, + 0.37830275, + 0.63265973, + 0.32151344, + 0.406371, + 0.67513967, + 0.5246535, + 0.55954224, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_ddpm(self): @@ -196,17 +214,19 @@ def test_stable_diffusion_pix2pix_zero_ddpm(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.5899046, - 0.17750263, - 0.50616807, - 0.39558932, - 0.02976257, - 0.35918522, - 0.32376733, - 0.31742626, - 0.52768075, - ]) + expected_slice = np.array( + [ + 0.5899046, + 0.17750263, + 0.50616807, + 0.39558932, + 0.02976257, + 0.35918522, + 0.32376733, + 0.31742626, + 0.52768075, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_num_images_per_prompt(self): @@ -218,14 +238,12 @@ def test_stable_diffusion_pix2pix_zero_num_images_per_prompt(self): assert images.shape == (1, 64, 64, 3) num_images_per_prompt = 2 inputs = self.get_dummy_inputs() - images = sd_pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (num_images_per_prompt, 64, 64, 3) batch_size = 2 inputs = self.get_dummy_inputs() inputs["prompt"] = [inputs["prompt"]] * batch_size - images = sd_pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = sd_pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3) # Non-determinism caused by the scheduler optimizing the latent inputs during inference @@ -245,14 +263,12 @@ def tearDown(self): @classmethod def setUpClass(cls): cls.source_embeds = to_paddle( - load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt" - )) + load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt") + ) cls.target_embeds = to_paddle( - load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt" - )) + load_pt("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt") + ) def get_inputs(self, 
seed=0): generator = paddle.Generator().manual_seed(seed=seed) @@ -272,46 +288,48 @@ def test_stable_diffusion_pix2pix_zero_default(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.8129883, - 0.81933594, - 0.80371094, - 0.8105469, - 0.8076172, - 0.80566406, - 0.81884766, - 0.8330078, - 0.82470703, - ]) + expected_slice = np.array( + [ + 0.8129883, + 0.81933594, + 0.80371094, + 0.8105469, + 0.8076172, + 0.80566406, + 0.81884766, + 0.8330078, + 0.82470703, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_k_lms(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05053711]) + expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05053711]) assert np.abs(expected_slice - image_slice).max() < 0.05 def test_stable_diffusion_pix2pix_zero_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -319,42 +337,45 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.93444633, - 1.1613252, - 0.7700033, - 0.18847837, - -1.17147, - 0.07546477, - 0.06142269, - -0.8030814, - -0.59692276, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.93444633, + 1.1613252, + 0.7700033, + 0.18847837, + -1.17147, + 0.07546477, + 0.06142269, + -0.8030814, + -0.59692276, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.93180454, - 1.1606954, - 0.7721853, - 0.18454231, - -1.1679069, - 0.07357024, - 0.06213593, - -0.80399096, - -0.5937987, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.93180454, + 1.1606954, + 0.7721853, + 0.18454231, + -1.1679069, + 0.07357024, + 0.06213593, + -0.80399096, + -0.5937987, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) 
pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -385,38 +406,29 @@ def test_stable_diffusion_pix2pix_inversion(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) - pipe.inverse_scheduler = DDIMScheduler.from_config( - pipe.scheduler.config) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config( - pipe.scheduler.config) + paddle_dtype=paddle.float16, + ) + pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) caption = "a photography of a cat with flowers" pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - output = pipe.invert( - caption, - image=self.raw_image, - generator=generator, - num_inference_steps=10) + output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10) inv_latents = output[0] image_slice = inv_latents[0, -3:, -3:, -1].flatten() assert tuple(inv_latents.shape) == (1, 4, 64, 64) - expected_slice = np.array([ - 0.8877, 0.0587, 0.77, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498, - -0.8599 - ]) + expected_slice = np.array([0.8877, 0.0587, 0.77, -1.6035, -0.5962, 0.4827, -0.6265, 1.0498, -0.8599]) assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 0.05 def test_stable_diffusion_pix2pix_full(self): pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) - pipe.inverse_scheduler = DDIMScheduler.from_config( - pipe.scheduler.config) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config( - pipe.scheduler.config) + paddle_dtype=paddle.float16, + ) + pipe.inverse_scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) caption = "a photography of a cat with flowers" pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) @@ -444,19 +456,22 @@ def test_stable_diffusion_pix2pix_full(self): generator=generator, latents=inv_latents, negative_prompt=caption, - output_type="np", ).images + output_type="np", + ).images image_slice = image[0, -3:, -3:, -1].flatten() - expected_slice = np.array([ - 0.64208984375, - 0.65673828125, - 0.650390625, - 0.6513671875, - 0.646484375, - 0.6650390625, - 0.6513671875, - 0.6640625, - 0.66796875, - ]) + expected_slice = np.array( + [ + 0.64208984375, + 0.65673828125, + 0.650390625, + 0.6513671875, + 0.646484375, + 0.6650390625, + 0.6513671875, + 0.6640625, + 0.66796875, + ] + ) max_diff = np.abs(image_slice - expected_slice).max() assert max_diff < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index d04d08d9bb18f..aa60def2d023c 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -20,8 +20,12 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - StableDiffusionSAGPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + StableDiffusionSAGPipeline, + UNet2DConditionModel, 
+) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -29,8 +33,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionSAGPipeline test_cpu_offload = False params = TEXT_TO_IMAGE_PARAMS @@ -46,13 +49,15 @@ def get_dummy_components(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -60,7 +65,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -71,10 +77,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -109,8 +115,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_stable_diffusion_1(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4") + sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") sag_pipe.set_progress_bar_config(disable=None) prompt = "." generator = paddle.Generator().manual_seed(0) @@ -120,26 +125,28 @@ def test_stable_diffusion_1(self): guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.7477613, - 0.76045597, - 0.7464366, - 0.778965, - 0.75718963, - 0.7487634, - 0.77530396, - 0.77426934, - 0.7749926, - ]) + expected_slice = np.array( + [ + 0.7477613, + 0.76045597, + 0.7464366, + 0.778965, + 0.75718963, + 0.7487634, + 0.77530396, + 0.77426934, + 0.7749926, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_2(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") + sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") sag_pipe.set_progress_bar_config(disable=None) prompt = "." 
generator = paddle.Generator().manual_seed(0) @@ -149,19 +156,22 @@ def test_stable_diffusion_2(self): guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.8771595, - 0.8521123, - 0.8644101, - 0.8680052, - 0.8700466, - 0.8897612, - 0.87766427, - 0.8636212, - 0.86829203, - ]) + expected_slice = np.array( + [ + 0.8771595, + 0.8521123, + 0.8644101, + 0.8680052, + 0.8700466, + 0.8897612, + 0.87766427, + 0.8636212, + 0.86829203, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 07d1870d2afd5..1e95848760207 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -21,10 +21,17 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from ppdiffusers import ( - AutoencoderKL, DDIMScheduler, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, - UNet2DConditionModel, logging) + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, + logging, +) from ppdiffusers.utils import load_numpy, nightly, slow from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu @@ -49,13 +56,15 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -64,7 +73,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -77,10 +87,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -112,17 +122,19 @@ def test_stable_diffusion_ddim(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.3505131, - 0.36318004, - 0.39201266, - 0.12107915, - 0.27704653, - 0.40363187, - 0.09379572, - 0.16225743, - 0.36048344, - ]) + expected_slice = np.array( + [ + 0.3505131, + 0.36318004, + 0.39201266, + 0.12107915, + 0.27704653, + 0.40363187, + 0.09379572, + 0.16225743, + 0.36048344, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_pndm(self): @@ 
-134,122 +146,127 @@ def test_stable_diffusion_pndm(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.25144678, - 0.35438284, - 0.3613463, - 0.11020249, - 0.3101831, - 0.42739886, - 0.1142821, - 0.17371863, - 0.35148838, - ]) + expected_slice = np.array( + [ + 0.25144678, + 0.35438284, + 0.3613463, + 0.11020249, + 0.3101831, + 0.42739886, + 0.1142821, + 0.17371863, + 0.35148838, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_lms(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.3676631, - 0.38155898, - 0.4023114, - 0.11294425, - 0.2891888, - 0.40432304, - 0.08882684, - 0.1466648, - 0.33633134, - ]) + expected_slice = np.array( + [ + 0.3676631, + 0.38155898, + 0.4023114, + 0.11294425, + 0.2891888, + 0.40432304, + 0.08882684, + 0.1466648, + 0.33633134, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler_ancestral(self): components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler.from_config( - components["scheduler"].config) + components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.36797395, - 0.38137895, - 0.40199342, - 0.11330777, - 0.2886864, - 0.40422022, - 0.08929691, - 0.14658183, - 0.3363046, - ]) + expected_slice = np.array( + [ + 0.36797395, + 0.38137895, + 0.40199342, + 0.11330777, + 0.2886864, + 0.40422022, + 0.08929691, + 0.14658183, + 0.3363046, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_k_euler(self): components = self.get_dummy_components() - components["scheduler"] = EulerDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = EulerDiscreteScheduler.from_config(components["scheduler"].config) sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.36766386, - 0.3815591, - 0.40231153, - 0.11294428, - 0.28918856, - 0.40432304, - 0.08882678, - 0.14666462, - 0.3363313, - ]) + expected_slice = np.array( + [ + 0.36766386, + 0.3815591, + 0.40231153, + 0.11294428, + 0.28918856, + 0.40432304, + 0.08882678, + 0.14666462, + 0.3363313, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_long_prompt(self): components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components[ - "scheduler"].config) + components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) 
sd_pipe = StableDiffusionPipeline(**components) sd_pipe.set_progress_bar_config(disable=None) do_classifier_free_guidance = True negative_prompt = None num_images_per_prompt = 1 - logger = logging.get_logger( - "ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") + logger = logging.get_logger("ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") prompt = 25 * "@" with CaptureLogger(logger) as cap_logger_3: text_embeddings_3 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) prompt = 100 * "@" with CaptureLogger(logger) as cap_logger: text_embeddings = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) + negative_prompt, + ) negative_prompt = "Hello" with CaptureLogger(logger) as cap_logger_2: text_embeddings_2 = sd_pipe._encode_prompt( prompt, num_images_per_prompt, do_classifier_free_guidance, - negative_prompt, ) - assert (text_embeddings_3.shape == text_embeddings_2.shape == - text_embeddings.shape) + negative_prompt, + ) + assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape assert text_embeddings.shape[1] == 77 assert cap_logger.out == cap_logger_2.out assert cap_logger.out.count("@") == 25 @@ -279,68 +296,71 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_default_ddim(self): - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.49493, - 0.47896, - 0.40798, - 0.54214, - 0.53212, - 0.48202, - 0.47656, - 0.46329, - 0.48506, - ]) + expected_slice = np.array( + [ + 0.49493, + 0.47896, + 0.40798, + 0.54214, + 0.53212, + 0.48202, + 0.47656, + 0.46329, + 0.48506, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_pndm(self): - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.49493, - 0.47896, - 0.40798, - 0.54214, - 0.53212, - 0.48202, - 0.47656, - 0.46329, - 0.48506, - ]) + expected_slice = np.array( + [ + 0.49493, + 0.47896, + 0.40798, + 0.54214, + 0.53212, + 0.48202, + 0.47656, + 0.46329, + 0.48506, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 def test_stable_diffusion_k_lms(self): - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1].flatten() assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.1044, - 0.13115, - 0.111, - 0.10141, - 0.1144, - 0.07215, - 0.11332, - 0.09693, - 0.10006, - ]) + 
expected_slice = np.array( + [ + 0.1044, + 0.13115, + 0.111, + 0.10141, + 0.1144, + 0.07215, + 0.11332, + 0.09693, + 0.10006, + ] + ) assert np.abs(image_slice - expected_slice).max() < 0.0001 # def test_stable_diffusion_attention_slicing(self): @@ -363,8 +383,7 @@ def test_stable_diffusion_k_lms(self): def test_stable_diffusion_text2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -372,40 +391,43 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.3862, - -0.4507, - -1.1729, - 0.0686, - -1.1045, - 0.7124, - -1.8301, - 0.1903, - 1.2773, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.3862, + -0.4507, + -1.1729, + 0.0686, + -1.1045, + 0.7124, + -1.8301, + 0.1903, + 1.2773, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 2: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 64, 64) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.272, - -0.1863, - -0.7383, - -0.5029, - -0.7534, - 0.397, - -0.7646, - 0.4468, - 1.2686, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + 0.272, + -0.1863, + -0.7383, + -0.5029, + -0.7534, + 0.397, + -0.7646, + 0.4468, + 1.2686, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 callback_fn.has_been_called = False pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16) + "stabilityai/stable-diffusion-2-base", paddle_dtype=paddle.float16 + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") @@ -437,8 +459,7 @@ def get_inputs(self, dtype="float32", seed=0): return inputs def test_stable_diffusion_2_0_default_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -449,8 +470,7 @@ def test_stable_diffusion_2_0_default_ddim(self): assert max_diff < 0.01 def test_stable_diffusion_2_1_default_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -461,8 +481,7 @@ def test_stable_diffusion_2_1_default_pndm(self): assert max_diff < 0.01 def test_stable_diffusion_ddim(self): # not pass - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() @@ -474,10 +493,8 @@ def test_stable_diffusion_ddim(self): # not pass assert max_diff < 0.01 def 
test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -488,10 +505,8 @@ def test_stable_diffusion_lms(self): assert max_diff < 0.01 def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = EulerDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") + sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() image = sd_pipe(**inputs).images[0] @@ -502,10 +517,8 @@ def test_stable_diffusion_euler(self): assert max_diff < 0.01 def test_stable_diffusion_dpm(self): # not pass - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1-base") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") + sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_inputs() inputs["num_inference_steps"] = 25 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 56aa066eb5a02..c63bfcf099735 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -20,9 +20,12 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - StableDiffusionAttendAndExcitePipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + StableDiffusionAttendAndExcitePipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import load_numpy, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -30,8 +33,7 @@ from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionAttendAndExcitePipeline test_attention_slicing = False params = TEXT_TO_IMAGE_PARAMS @@ -49,13 +51,15 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -64,7 +68,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], 
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -77,10 +82,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -103,9 +108,7 @@ def get_dummy_inputs(self, seed=0): "guidance_scale": 6.0, "output_type": "numpy", "max_iter_to_alter": 2, - "thresholds": { - (0): 0.7 - }, + "thresholds": {(0): 0.7}, } return inputs @@ -117,17 +120,19 @@ def test_inference(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 64, 64, 3)) - expected_slice = np.array([ - 0.33271241188049316, - 0.3123358190059662, - 0.44427454471588135, - 0.08615309000015259, - 0.26107650995254517, - 0.4551312029361725, - 0.06545555591583252, - 0.1626836657524109, - 0.3982071578502655, - ]) + expected_slice = np.array( + [ + 0.33271241188049316, + 0.3123358190059662, + 0.44427454471588135, + 0.08615309000015259, + 0.26107650995254517, + 0.4551312029361725, + 0.06545555591583252, + 0.1626836657524109, + 0.3982071578502655, + ] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) @@ -149,7 +154,8 @@ def test_attend_and_excite_fp16(self): pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) prompt = "a painting of an elephant with glasses" token_indices = [5, 7] @@ -160,7 +166,8 @@ def test_attend_and_excite_fp16(self): generator=generator, num_inference_steps=5, max_iter_to_alter=5, - output_type="numpy", ).images[0] + output_type="numpy", + ).images[0] expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy" ) diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 077ed16dba212..240b7ae56d4da 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -20,30 +20,39 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - CLIPTokenizer, DPTConfig, - DPTForDepthEstimation, DPTImageProcessor) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + DPTConfig, + DPTForDepthEstimation, + DPTImageProcessor, +) from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionDepth2ImgPipeline, - UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionDepth2ImgPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, nightly, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) 
+from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionDepth2ImgPipeline test_save_load_optional_components = False params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents" - } + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS def get_dummy_components(self): @@ -58,7 +67,8 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -67,7 +77,8 @@ def get_dummy_components(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -78,10 +89,10 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") backbone_config = { "global_padding": "same", "layer_type": "bottleneck", @@ -107,10 +118,10 @@ def get_dummy_components(self): initializer_range=0.02, is_hybrid=True, backbone_config=backbone_config, - backbone_featmap_shape=[1, 384, 24, 24], ) + backbone_featmap_shape=[1, 384, 24, 24], + ) depth_estimator = DPTForDepthEstimation(depth_estimator_config) - feature_extractor = DPTImageProcessor.from_pretrained( - "hf-internal-testing/tiny-random-DPTForDepthEstimation") + feature_extractor = DPTImageProcessor.from_pretrained("hf-internal-testing/tiny-random-DPTForDepthEstimation") components = { "unet": unet, "scheduler": scheduler, @@ -146,8 +157,7 @@ def test_save_load_local(self): output = pipe(**inputs)[0] with tempfile.TemporaryDirectory() as tmpdir: pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained( - tmpdir, from_diffusers=False) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) pipe_loaded.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output_loaded = pipe_loaded(**inputs)[0] @@ -215,17 +225,19 @@ def test_stable_diffusion_depth2img_default_case(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.35397637, - 0.23190483, - 0.20131412, - 0.27374774, - 0.265134, - 0.4502194, - 0.26852018, - 0.37504935, - 0.43135768, - ]) + expected_slice = np.array( + [ + 0.35397637, + 0.23190483, + 0.20131412, + 0.27374774, + 0.265134, + 0.4502194, + 0.26852018, + 0.37504935, + 0.43135768, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_depth2img_negative_prompt(self): @@ -238,17 +250,19 @@ def 
test_stable_diffusion_depth2img_negative_prompt(self): image = output.images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.40259343, - 0.37764466, - 0.3936328, - 0.3628915, - 0.48100996, - 0.59685427, - 0.22927544, - 0.45186657, - 0.46950823, - ]) + expected_slice = np.array( + [ + 0.40259343, + 0.37764466, + 0.3936328, + 0.3628915, + 0.48100996, + 0.59685427, + 0.22927544, + 0.45186657, + 0.46950823, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_depth2img_multiple_init_images(self): @@ -261,17 +275,19 @@ def test_stable_diffusion_depth2img_multiple_init_images(self): image = pipe(**inputs).images image_slice = image[-1, -3:, -3:, -1] assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([ - 0.8169553, - 0.4573238, - 0.27039874, - 0.60622, - 0.35670877, - 0.39508212, - 0.56803817, - 0.5341117, - 0.44428858, - ]) + expected_slice = np.array( + [ + 0.8169553, + 0.4573238, + 0.27039874, + 0.60622, + 0.35670877, + 0.39508212, + 0.56803817, + 0.5341117, + 0.44428858, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 def test_stable_diffusion_depth2img_num_images_per_prompt(self): @@ -288,14 +304,12 @@ def test_stable_diffusion_depth2img_num_images_per_prompt(self): assert images.shape == (batch_size, 32, 32, 3) num_images_per_prompt = 2 inputs = self.get_dummy_inputs() - images = pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (num_images_per_prompt, 32, 32, 3) batch_size = 2 inputs = self.get_dummy_inputs() inputs["prompt"] = [inputs["prompt"]] * batch_size - images = pipe( - **inputs, num_images_per_prompt=num_images_per_prompt).images + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) def test_stable_diffusion_depth2img_pil(self): @@ -305,17 +319,19 @@ def test_stable_diffusion_depth2img_pil(self): inputs = self.get_dummy_inputs() image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.35397637, - 0.23190483, - 0.20131412, - 0.27374774, - 0.265134, - 0.4502194, - 0.26852018, - 0.37504935, - 0.43135768, - ]) + expected_slice = np.array( + [ + 0.35397637, + 0.23190483, + 0.20131412, + 0.27374774, + 0.265134, + 0.4502194, + 0.26852018, + 0.37504935, + 0.43135768, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.001 @@ -345,7 +361,8 @@ def get_inputs(self, dtype="float32", seed=0): def test_stable_diffusion_depth2img_pipeline_default(self): pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None) + "stabilityai/stable-diffusion-2-depth", safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs() @@ -353,22 +370,25 @@ def test_stable_diffusion_depth2img_pipeline_default(self): image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 480, 640, 3) # expected_slice = np.array([0.9057, 0.9365, 0.9258, 0.8937, 0.8555, 0.8541, 0.826, 0.7747, 0.7421]) - expected_slice = np.array([ - 0.75446224, - 0.746921, - 0.7595095, - 0.8161169, - 0.8059271, - 0.7999228, - 0.9052905, - 0.879215, - 0.8690305, - ]) + expected_slice = np.array( + [ + 0.75446224, + 0.746921, + 0.7595095, + 0.8161169, + 0.8059271, + 0.7999228, + 
0.9052905, + 0.879215, + 0.8690305, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.1 def test_stable_diffusion_depth2img_pipeline_k_lms(self): pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None) + "stabilityai/stable-diffusion-2-depth", safety_checker=None + ) pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -377,22 +397,25 @@ def test_stable_diffusion_depth2img_pipeline_k_lms(self): image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 480, 640, 3) # expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.637, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306]) - expected_slice = np.array([ - 0.6395747, - 0.64879197, - 0.6566683, - 0.6438427, - 0.6707787, - 0.63587487, - 0.66576767, - 0.62180007, - 0.6628648, - ]) + expected_slice = np.array( + [ + 0.6395747, + 0.64879197, + 0.6566683, + 0.6438427, + 0.6707787, + 0.63587487, + 0.66576767, + 0.62180007, + 0.6628648, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.1 def test_stable_diffusion_depth2img_pipeline_ddim(self): pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None) + "stabilityai/stable-diffusion-2-depth", safety_checker=None + ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() @@ -401,25 +424,26 @@ def test_stable_diffusion_depth2img_pipeline_ddim(self): image_slice = image[0, 253:256, 253:256, -1].flatten() assert image.shape == (1, 480, 640, 3) # expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.642, 0.6522, 0.6555, 0.6436]) - expected_slice = np.array([ - 0.6283968, - 0.6419119, - 0.6295293, - 0.63652724, - 0.6420511, - 0.61574477, - 0.62251365, - 0.65826833, - 0.6480877, - ]) + expected_slice = np.array( + [ + 0.6283968, + 0.6419119, + 0.6295293, + 0.63652724, + 0.6420511, + 0.61574477, + 0.62251365, + 0.65826833, + 0.6480877, + ] + ) assert np.abs(expected_slice - image_slice).max() < 0.15 def test_stable_diffusion_depth2img_intermediate_state(self): number_of_steps = 0 - def callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -427,25 +451,27 @@ def callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 60, 80) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -1.148, - -0.2147, - -0.618, - -2.48, - -2.348, - 0.3945, - -2.05, - -1.566, - -1.52, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.1 + expected_slice = np.array( + [ + -1.148, + -0.2147, + -0.618, + -2.48, + -2.348, + 0.3945, + -2.05, + -1.566, + -1.52, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.1 callback_fn.has_been_called = False pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( "stabilityai/stable-diffusion-2-depth", safety_checker=None, - paddle_dtype=paddle.float16, ) + paddle_dtype=paddle.float16, + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() inputs = self.get_inputs(dtype="float16") diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py 
b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index 6e0d5f33a5bdc..a926f2ed14718 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -22,18 +22,23 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, PNDMScheduler, - StableDiffusionInpaintPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + PNDMScheduler, + StableDiffusionInpaintPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow -from ..pipeline_params import (TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_INPAINTING_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS @@ -50,7 +55,8 @@ def get_dummy_components(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) scheduler = PNDMScheduler(skip_prk_steps=True) paddle.seed(0) vae = AutoencoderKL( @@ -60,7 +66,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -73,10 +80,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -91,11 +98,8 @@ def get_dummy_components(self): def get_dummy_inputs(self, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) image = image.cpu().transpose(perm=[0, 2, 3, 1])[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (64, 64))) + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) generator = paddle.Generator().manual_seed(seed) inputs = { @@ -117,17 +121,19 @@ def test_stable_diffusion_inpaint(self): image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.58470726, - 0.49302375, - 0.3954028, - 0.4068969, - 0.33668613, - 0.50350493, - 0.34411103, - 0.25261122, - 0.4531455, - ]) + expected_slice = np.array( + [ + 0.58470726, + 0.49302375, + 0.3954028, + 0.4068969, + 0.33668613, + 0.50350493, + 0.34411103, + 0.25261122, + 0.4531455, + ] + ) assert 
np.abs(image_slice.flatten() - expected_slice).max() < 0.01 @@ -151,8 +157,7 @@ def test_stable_diffusion_inpaint_pipeline(self): # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/yellow_cat_sitting_on_a_park_bench.npy' # ) model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, safety_checker=None) + pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "Face of a yellow cat, high resolution, sitting on a park bench" @@ -162,7 +167,8 @@ def test_stable_diffusion_inpaint_pipeline(self): image=init_image, mask_image=mask_image, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (512, 512, 3) image = image[-3:, -3:, -1] @@ -186,7 +192,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self): # ) model_id = "stabilityai/stable-diffusion-2-inpainting" pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, paddle_dtype=paddle.float16, safety_checker=None) + model_id, paddle_dtype=paddle.float16, safety_checker=None + ) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "Face of a yellow cat, high resolution, sitting on a park bench" @@ -196,7 +203,8 @@ def test_stable_diffusion_inpaint_pipeline_fp16(self): image=init_image, mask_image=mask_image, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (512, 512, 3) image = image[-3:, -3:, -1] diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index 0224ae1e8b294..ec93a578bbaf2 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -21,19 +21,24 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, EulerDiscreteScheduler, - StableDiffusionLatentUpscalePipeline, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + EulerDiscreteScheduler, + StableDiffusionLatentUpscalePipeline, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu -from ..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionLatentUpscalePipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { "height", @@ -42,9 +47,7 @@ class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, "negative_prompt_embeds", "prompt_embeds", } - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt" - } + required_optional_params = PipelineTesterMixin.required_optional_params - 
{"num_images_per_prompt"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS test_cpu_offload = False @@ -53,8 +56,7 @@ def dummy_image(self): batch_size = 1 num_channels = 4 sizes = 16, 16 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image def get_dummy_components(self): @@ -72,7 +74,8 @@ def get_dummy_components(self): "KDownBlock2D", "KCrossAttnDownBlock2D", "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", ), + "KCrossAttnDownBlock2D", + ), in_channels=8, mid_block_type=None, only_cross_attention=False, @@ -84,7 +87,9 @@ def get_dummy_components(self): "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", - "KUpBlock2D", ), ) + "KUpBlock2D", + ), + ) vae = AutoencoderKL( block_out_channels=[32, 32, 64, 64], in_channels=3, @@ -101,7 +106,8 @@ def get_dummy_components(self): "UpDecoderBlock2D", "UpDecoderBlock2D", ], - latent_channels=4, ) + latent_channels=4, + ) scheduler = EulerDiscreteScheduler(prediction_type="sample") text_config = CLIPTextConfig( bos_token_id=0, @@ -114,10 +120,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="quick_gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_config).eval() - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": model.eval(), "vae": vae.eval(), @@ -147,17 +153,19 @@ def test_inference(self): image = pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] self.assertEqual(image.shape, (1, 256, 256, 3)) - expected_slice = np.array([ - 0.5665861368179321, - 0.7449524402618408, - 0.0, - 0.1325536072254181, - 0.4274534583091736, - 0.0, - 0.0, - 0.14426982402801514, - 0.0, - ]) + expected_slice = np.array( + [ + 0.5665861368179321, + 0.7449524402618408, + 0.0, + 0.1325536072254181, + 0.4274534583091736, + 0.0, + 0.0, + 0.14426982402801514, + 0.0, + ] + ) max_diff = np.abs(image_slice.flatten() - expected_slice).max() self.assertLessEqual(max_diff, 0.001) @@ -175,25 +183,23 @@ def tearDown(self): def test_latent_upscaler_fp16(self): generator = paddle.Generator().manual_seed(seed=33) - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", paddle_dtype=paddle.float16) pipe.to("gpu") upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( - "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16) + "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16 + ) upscaler.to("gpu") - prompt = ( - "a photo of an astronaut high resolution, unreal engine, ultra realistic" - ) - low_res_latents = pipe( - prompt, generator=generator, output_type="latent").images + prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic" + low_res_latents = pipe(prompt, generator=generator, output_type="latent").images image = upscaler( prompt=prompt, image=low_res_latents, num_inference_steps=20, guidance_scale=0, generator=generator, - output_type="np", ).images[0] + output_type="np", + ).images[0] # invalid expected_image # expected_image = load_numpy( # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/astronaut_1024.npy" @@ -209,7 +215,8 @@ def test_latent_upscaler_fp16(self): 
def test_latent_upscaler_fp16_image(self): generator = paddle.Generator().manual_seed(seed=33) upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained( - "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16) + "stabilityai/sd-x2-latent-upscaler", paddle_dtype=paddle.float16 + ) upscaler.to("gpu") prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas" @@ -222,7 +229,8 @@ def test_latent_upscaler_fp16_image(self): num_inference_steps=20, guidance_scale=0, generator=generator, - output_type="np", ).images[0] + output_type="np", + ).images[0] # invalid expected_image # expected_image = load_numpy( # "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_1024.npy" diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py index ca4e467ebdca2..35a135bc747e3 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py @@ -22,8 +22,13 @@ from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from PIL import Image -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - StableDiffusionUpscalePipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + StableDiffusionUpscalePipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import floats_tensor, load_image, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -39,8 +44,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = (32, 32) - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -55,15 +59,16 @@ def dummy_cond_unet_upscale(self): down_block_types=( "DownBlock2D", "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", ), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", - "UpBlock2D"), + "CrossAttnDownBlock2D", + ), + up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, # SD2-specific config below attention_head_dim=8, use_linear_projection=True, only_cross_attention=(True, True, False), - num_class_embeds=100, ) + num_class_embeds=100, + ) return model @property @@ -78,10 +83,9 @@ def dummy_vae(self): "DownEncoderBlock2D", "DownEncoderBlock2D", ], - up_block_types=[ - "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D" - ], - latent_channels=4, ) + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) return model @property @@ -99,7 +103,8 @@ def dummy_text_encoder(self): vocab_size=1000, # SD2-specific config below hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) return CLIPTextModel(config).eval() def test_stable_diffusion_upscale(self): @@ -108,11 +113,9 @@ def test_stable_diffusion_upscale(self): scheduler = DDIMScheduler(prediction_type="v_prediction") vae = self.dummy_vae text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 
64)) + low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) sd_pipe = StableDiffusionUpscalePipeline( unet=unet, low_res_scheduler=low_res_scheduler, @@ -120,7 +123,8 @@ def test_stable_diffusion_upscale(self): vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, - max_noise_level=350, ) + max_noise_level=350, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -131,7 +135,8 @@ def test_stable_diffusion_upscale(self): guidance_scale=6.0, noise_level=20, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -142,26 +147,27 @@ def test_stable_diffusion_upscale(self): noise_level=20, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, - 3) - expected_slice = np.array([ - 0.0, - 0.0, - 0.3616839, - 0.0, - 0.04877859, - 0.59195685, - 0.23902711, - 0.00838843, - 0.5172206, - ]) + assert image.shape == (1, expected_height_width, expected_height_width, 3) + expected_slice = np.array( + [ + 0.0, + 0.0, + 0.3616839, + 0.0, + 0.04877859, + 0.59195685, + 0.23902711, + 0.00838843, + 0.5172206, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_upscale_batch(self): unet = self.dummy_cond_unet_upscale @@ -169,11 +175,9 @@ def test_stable_diffusion_upscale_batch(self): scheduler = DDIMScheduler(prediction_type="v_prediction") vae = self.dummy_vae text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) + low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) sd_pipe = StableDiffusionUpscalePipeline( unet=unet, low_res_scheduler=low_res_scheduler, @@ -181,7 +185,8 @@ def test_stable_diffusion_upscale_batch(self): vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, - max_noise_level=350, ) + max_noise_level=350, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" output = sd_pipe( @@ -190,7 +195,8 @@ def test_stable_diffusion_upscale_batch(self): guidance_scale=6.0, noise_level=20, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images assert image.shape[0] == 2 generator = paddle.Generator().manual_seed(0) @@ -202,7 +208,8 @@ def test_stable_diffusion_upscale_batch(self): guidance_scale=6.0, noise_level=20, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images assert image.shape[0] == 2 @@ -213,11 +220,9 @@ def test_stable_diffusion_upscale_fp16(self): scheduler = DDIMScheduler(prediction_type="v_prediction") vae = self.dummy_vae text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - 
"hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image.cpu().transpose(perm=[0, 2, 3, 1])[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize( - (64, 64)) + low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) unet = unet.to(dtype=paddle.float16) text_encoder = text_encoder.to(dtype=paddle.float16) sd_pipe = StableDiffusionUpscalePipeline( @@ -227,7 +232,8 @@ def test_stable_diffusion_upscale_fp16(self): vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, - max_noise_level=350, ) + max_noise_level=350, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -236,10 +242,10 @@ def test_stable_diffusion_upscale_fp16(self): image=low_res_image, generator=generator, num_inference_steps=2, - output_type="np", ).images + output_type="np", + ).images expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, - 3) + assert image.shape == (1, expected_height_width, expected_height_width, 3) @slow @@ -264,8 +270,7 @@ def test_stable_diffusion_upscale_pipeline(self): pipe.enable_attention_slicing() prompt = "a cat sitting on a park bench" generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, image=image, generator=generator, output_type="np") + output = pipe(prompt=prompt, image=image, generator=generator, output_type="np") image = output.images[0] assert image.shape == (512, 512, 3) image = image[-3:, -3:, -1] diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index daa755dc68597..b482ca6657633 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -20,9 +20,14 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, EulerDiscreteScheduler, - StableDiffusionPipeline, UNet2DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerDiscreteScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -46,7 +51,8 @@ def dummy_cond_unet(self): up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32, attention_head_dim=(2, 4), - use_linear_projection=True, ) + use_linear_projection=True, + ) return model @property @@ -59,7 +65,8 @@ def dummy_vae(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) return model @property @@ -76,7 +83,8 @@ def dummy_text_encoder(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=64, ) + projection_dim=64, + ) return CLIPTextModel(config).eval() def test_stable_diffusion_v_pred_ddim(self): @@ -87,11 +95,11 @@ def test_stable_diffusion_v_pred_ddim(self): beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - prediction_type="v_prediction", ) + prediction_type="v_prediction", + ) vae = self.dummy_vae bert = 
self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -100,7 +108,8 @@ def test_stable_diffusion_v_pred_ddim(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -109,7 +118,8 @@ def test_stable_diffusion_v_pred_ddim(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -118,24 +128,26 @@ def test_stable_diffusion_v_pred_ddim(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.36126757, - 0.40778637, - 0.36956796, - 0.14816678, - 0.25735706, - 0.36562037, - 0.1229952, - 0.22826642, - 0.4154452, - ]) + expected_slice = np.array( + [ + 0.36126757, + 0.40778637, + 0.36956796, + 0.14816678, + 0.25735706, + 0.36562037, + 0.1229952, + 0.22826642, + 0.4154452, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_k_euler(self): unet = self.dummy_cond_unet @@ -143,11 +155,11 @@ def test_stable_diffusion_v_pred_k_euler(self): beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", - prediction_type="v_prediction", ) + prediction_type="v_prediction", + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -156,7 +168,8 @@ def test_stable_diffusion_v_pred_k_euler(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -165,7 +178,8 @@ def test_stable_diffusion_v_pred_k_euler(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -174,24 +188,26 @@ def test_stable_diffusion_v_pred_k_euler(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.39991996, - 0.45191997, - 0.34044766, - 0.2136086, - 0.2758901, - 0.31222183, - 0.21658134, - 0.34479994, - 0.43742967, - ]) + expected_slice = np.array( + [ + 0.39991996, + 0.45191997, + 0.34044766, + 0.2136086, + 0.2758901, + 0.31222183, + 
0.21658134, + 0.34479994, + 0.43742967, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_fp16(self): """Test that stable diffusion v-prediction works with fp16""" @@ -202,11 +218,11 @@ def test_stable_diffusion_v_pred_fp16(self): beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, - prediction_type="v_prediction", ) + prediction_type="v_prediction", + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet = unet.to(dtype=paddle.float16) vae = vae.to(dtype=paddle.float16) bert = bert.to(dtype=paddle.float16) @@ -218,15 +234,12 @@ def test_stable_diffusion_v_pred_fp16(self): tokenizer=tokenizer, safety_checker=None, feature_extractor=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) - image = sd_pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np").images + image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images assert image.shape == (1, 64, 64, 3) @@ -239,8 +252,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_stable_diffusion_v_pred_default(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" @@ -250,26 +262,30 @@ def test_stable_diffusion_v_pred_default(self): generator=generator, guidance_scale=7.5, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.05667132, - 0.05700234, - 0.04156408, - 0.04631725, - 0.04327643, - 0.06003231, - 0.05165312, - 0.05258191, - 0.0865913, - ]) + expected_slice = np.array( + [ + 0.05667132, + 0.05700234, + 0.04156408, + 0.04631725, + 0.04327643, + 0.06003231, + 0.05165312, + 0.05258191, + 0.0865913, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_upcast_attention(self): sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16) + "stabilityai/stable-diffusion-2-1", paddle_dtype=paddle.float16 + ) sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" @@ -279,52 +295,51 @@ def test_stable_diffusion_v_pred_upcast_attention(self): generator=generator, guidance_scale=7.5, num_inference_steps=20, - output_type="np", ) + output_type="np", + ) image = output.images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.04541016, - 0.04516602, - 0.05493164, - 0.05078125, - 0.04296875, - 0.07275391, - 0.06567383, - 0.0534668, - 0.04833984, - ]) + expected_slice = np.array( + [ + 0.04541016, + 0.04516602, + 0.05493164, + 0.05078125, + 
0.04296875, + 0.07275391, + 0.06567383, + 0.0534668, + 0.04833984, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.05 def test_stable_diffusion_v_pred_euler(self): - scheduler = EulerDiscreteScheduler.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="scheduler") - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", scheduler=scheduler) + scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler") + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - num_inference_steps=5, - output_type="numpy") + output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy") image = output.images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.03515199, - 0.03756374, - 0.05046153, - 0.04240236, - 0.05509549, - 0.06556576, - 0.04710263, - 0.02758819, - 0.05959105, - ]) + expected_slice = np.array( + [ + 0.03515199, + 0.03756374, + 0.05046153, + 0.04240236, + 0.05509549, + 0.06556576, + 0.04710263, + 0.02758819, + 0.05959105, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_v_pred_dpm(self): @@ -332,9 +347,9 @@ def test_stable_diffusion_v_pred_dpm(self): TODO: update this test after making DPM compatible with V-prediction! """ scheduler = DPMSolverMultistepScheduler.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="scheduler") - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", scheduler=scheduler) + "stabilityai/stable-diffusion-2", subfolder="scheduler" + ) + sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) sd_pipe.enable_attention_slicing() sd_pipe.set_progress_bar_config(disable=None) prompt = "a photograph of an astronaut riding a horse" @@ -344,20 +359,23 @@ def test_stable_diffusion_v_pred_dpm(self): generator=generator, guidance_scale=7.5, num_inference_steps=5, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([ - 0.20492354, - 0.2115368, - 0.2323401, - 0.2415919, - 0.25598443, - 0.24843931, - 0.25171167, - 0.23580211, - 0.23604062, - ]) + expected_slice = np.array( + [ + 0.20492354, + 0.2115368, + 0.2323401, + 0.2415919, + 0.25598443, + 0.24843931, + 0.25171167, + 0.23580211, + 0.23604062, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 # def test_stable_diffusion_attention_slicing_v_pred(self): @@ -387,30 +405,27 @@ def test_stable_diffusion_text2img_pipeline_v_pred_default(self): # expected_image = load_numpy( # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred.npy' # ) - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2") + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") pipe.enable_attention_slicing() pipe.set_progress_bar_config(disable=None) prompt = "astronaut riding a horse" generator = paddle.Generator().manual_seed(0) - output = pipe( - 
prompt=prompt, - guidance_scale=7.5, - generator=generator, - output_type="np") + output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") image = output.images[0] assert image.shape == (768, 768, 3) - expected_image = np.array([ - 0.26713198, - 0.2630347, - 0.25486767, - 0.23375505, - 0.24399692, - 0.22363415, - 0.24688962, - 0.21346492, - 0.23014635, - ]) + expected_image = np.array( + [ + 0.26713198, + 0.2630347, + 0.25486767, + 0.23375505, + 0.24399692, + 0.22363415, + 0.24688962, + 0.21346492, + 0.23014635, + ] + ) image = image[-3:, -3:, -1].flatten() assert np.abs(expected_image - image).max() < 0.075 @@ -419,37 +434,33 @@ def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self): # expected_image = load_numpy( # 'https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy' # ) - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) prompt = "astronaut riding a horse" generator = paddle.Generator().manual_seed(0) - output = pipe( - prompt=prompt, - guidance_scale=7.5, - generator=generator, - output_type="np") + output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") image = output.images[0] assert image.shape == (768, 768, 3) - expected_image = np.array([ - 0.26220703, - 0.25195312, - 0.2434082, - 0.22753906, - 0.23632812, - 0.21777344, - 0.23901367, - 0.20629883, - 0.22192383, - ]) + expected_image = np.array( + [ + 0.26220703, + 0.25195312, + 0.2434082, + 0.22753906, + 0.23632812, + 0.21777344, + 0.23901367, + 0.20629883, + 0.22192383, + ] + ) image = image[-3:, -3:, -1].flatten() assert np.abs(expected_image - image).max() < 0.75 def test_stable_diffusion_text2img_intermediate_state_v_pred(self): number_of_steps = 0 - def test_callback_fn(step: int, timestep: int, - latents: paddle.Tensor) -> None: + def test_callback_fn(step: int, timestep: int, latents: paddle.Tensor) -> None: test_callback_fn.has_been_called = True nonlocal number_of_steps number_of_steps += 1 @@ -457,40 +468,41 @@ def test_callback_fn(step: int, timestep: int, latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 96, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.2542, - -1.276, - 0.426, - -0.956, - -1.173, - -0.5884, - 2.416, - 0.1553, - -1.21, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.2542, + -1.276, + 0.426, + -0.956, + -1.173, + -0.5884, + 2.416, + 0.1553, + -1.21, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 elif step == 19: latents = latents.detach().cpu().numpy() assert latents.shape == (1, 4, 96, 96) latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([ - -0.959, - -0.964, - -0.614, - 0.0977, - -0.6953, - -0.2343, - 1.551, - -0.03357, - -0.11395, - ]) - assert np.abs(latents_slice.flatten() - expected_slice).max( - ) < 0.05 + expected_slice = np.array( + [ + -0.959, + -0.964, + -0.614, + 0.0977, + -0.6953, + -0.2343, + 1.551, + -0.03357, + -0.11395, + ] + ) + assert np.abs(latents_slice.flatten() - expected_slice).max() < 0.05 test_callback_fn.has_been_called = False - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", 
paddle_dtype=paddle.float16) + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing() prompt = "Andromeda galaxy in a bottle" @@ -501,6 +513,7 @@ def test_callback_fn(step: int, timestep: int, guidance_scale=7.5, generator=generator, callback=test_callback_fn, - callback_steps=1, ) + callback_steps=1, + ) assert test_callback_fn.has_been_called assert number_of_steps == 20 diff --git a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py index 2bfa1261d9065..b2bdac5b34ed7 100644 --- a/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py +++ b/ppdiffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py @@ -22,10 +22,16 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, - PNDMScheduler, UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion_safe import \ - StableDiffusionPipelineSafe as StableDiffusionPipeline +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion_safe import ( + StableDiffusionPipelineSafe as StableDiffusionPipeline, +) from ppdiffusers.utils import floats_tensor, nightly from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -41,8 +47,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image @property @@ -56,7 +61,8 @@ def dummy_cond_unet(self): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -68,7 +74,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -83,7 +90,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -108,11 +116,11 @@ def test_safe_diffusion_ddim(self): beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -120,7 +128,8 @@ def test_safe_diffusion_ddim(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -129,7 +138,8 @@ def test_safe_diffusion_ddim(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = 
output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -138,32 +148,33 @@ def test_safe_diffusion_ddim(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.28519452, - 0.23807159, - 0.38150585, - 0.21930319, - 0.26092738, - 0.517212, - 0.2563907, - 0.2503956, - 0.47978917, - ]) + expected_slice = np.array( + [ + 0.28519452, + 0.23807159, + 0.38150585, + 0.21930319, + 0.26092738, + 0.517212, + 0.2563907, + 0.2503956, + 0.47978917, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_pndm(self): unet = self.dummy_cond_unet scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd_pipe = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -171,7 +182,8 @@ def test_stable_diffusion_pndm(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" generator = paddle.Generator().manual_seed(0) @@ -180,7 +192,8 @@ def test_stable_diffusion_pndm(self): generator=generator, guidance_scale=6.0, num_inference_steps=2, - output_type="np", ) + output_type="np", + ) image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = sd_pipe( @@ -189,29 +202,31 @@ def test_stable_diffusion_pndm(self): guidance_scale=6.0, num_inference_steps=2, output_type="np", - return_dict=False, )[0] + return_dict=False, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 0.18763152, - 0.24242553, - 0.36067978, - 0.21772456, - 0.27213728, - 0.5194623, - 0.2227565, - 0.2217454, - 0.4453961, - ]) + expected_slice = np.array( + [ + 0.18763152, + 0.24242553, + 0.36067978, + 0.21772456, + 0.27213728, + 0.5194623, + 0.2227565, + 0.2217454, + 0.4453961, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_stable_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None + ) assert isinstance(pipe, StableDiffusionPipeline) assert isinstance(pipe.scheduler, LMSDiscreteScheduler) assert pipe.safety_checker is None @@ -219,8 +234,7 @@ def test_stable_diffusion_no_safety_checker(self): assert image is not None with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = StableDiffusionPipeline.from_pretrained(tmpdirname, 
from_diffusers=False) assert pipe.safety_checker is None image = pipe("example prompt", num_inference_steps=2).images[0] assert image is not None @@ -231,8 +245,7 @@ def test_stable_diffusion_fp16(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") unet = unet.to(dtype=paddle.float16) vae = vae.to(dtype=paddle.float16) bert = bert.to(dtype=paddle.float16) @@ -243,11 +256,11 @@ def test_stable_diffusion_fp16(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd_pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger" - image = sd_pipe( - [prompt], num_inference_steps=2, output_type="np").images + image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images assert image.shape == (1, 64, 64, 3) @@ -260,10 +273,8 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_harm_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) prompt = "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and children from bahnhof zoo, detailed " seed = 4003660346 @@ -277,7 +288,8 @@ def test_harm_safe_stable_diffusion(self): output_type="np", width=512, height=512, - sld_guidance_scale=0, ) + sld_guidance_scale=0, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -306,7 +318,8 @@ def test_harm_safe_stable_diffusion(self): sld_warmup_steps=7, sld_threshold=0.025, sld_momentum_scale=0.5, - sld_mom_beta=0.7, ) + sld_mom_beta=0.7, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -324,10 +337,8 @@ def test_harm_safe_stable_diffusion(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_nudity_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config( - sd_pipe.scheduler.config) + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) + sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) sd_pipe.set_progress_bar_config(disable=None) prompt = "padme amidala taking a bath artwork, safe for work, no nudity" seed = 2734971755 @@ -341,7 +352,8 @@ def test_nudity_safe_stable_diffusion(self): output_type="np", width=512, height=512, - sld_guidance_scale=0, ) + sld_guidance_scale=0, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -370,7 +382,8 @@ def test_nudity_safe_stable_diffusion(self): sld_warmup_steps=7, sld_threshold=0.025, sld_momentum_scale=0.5, - 
sld_mom_beta=0.7, ) + sld_mom_beta=0.7, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = [ @@ -388,8 +401,7 @@ def test_nudity_safe_stable_diffusion(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_nudity_safetychecker_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5") + sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") sd_pipe.set_progress_bar_config(disable=None) prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker" seed = 1044355234 @@ -403,7 +415,8 @@ def test_nudity_safetychecker_safe_stable_diffusion(self): output_type="np", width=512, height=512, - sld_guidance_scale=0, ) + sld_guidance_scale=0, + ) image = output.images image_slice = image[0, -3:, -3:, -1] expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) @@ -422,12 +435,10 @@ def test_nudity_safetychecker_safe_stable_diffusion(self): sld_warmup_steps=7, sld_threshold=0.025, sld_momentum_scale=0.5, - sld_mom_beta=0.7, ) + sld_mom_beta=0.7, + ) image = output.images image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([ - 0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, - 0.6561 - ]) + expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]) assert image.shape == (1, 512, 512, 3) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py index 79cfcb2145995..fb5982706c2c9 100644 --- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -16,14 +16,24 @@ import unittest import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModel, - CLIPTextModelWithProjection, CLIPTokenizer) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, +) -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - PriorTransformer, StableUnCLIPPipeline, - UNet2DConditionModel) -from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import \ - StableUnCLIPImageNormalizer +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + PriorTransformer, + StableUnCLIPPipeline, + UNet2DConditionModel, +) +from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import ( + StableUnCLIPImageNormalizer, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin @@ -39,8 +49,7 @@ def get_dummy_components(self): embedder_hidden_size = 32 embedder_projection_dim = embedder_hidden_size paddle.seed(0) - prior_tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.seed(0) prior_text_encoder = CLIPTextModelWithProjection( CLIPTextConfig( @@ -53,13 +62,16 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, )) + vocab_size=1000, + ) + ) paddle.seed(0) prior = PriorTransformer( num_attention_heads=2, attention_head_dim=12, embedding_dim=embedder_projection_dim, - num_layers=1, ) + num_layers=1, + ) 
paddle.seed(0) prior_scheduler = DDPMScheduler( variance_type="fixed_small_log", @@ -67,15 +79,13 @@ def get_dummy_components(self): num_train_timesteps=1000, clip_sample=True, clip_sample_range=5.0, - beta_schedule="squaredcos_cap_v2", ) + beta_schedule="squaredcos_cap_v2", + ) paddle.seed(0) - image_normalizer = StableUnCLIPImageNormalizer( - embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler( - beta_schedule="squaredcos_cap_v2") + image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) + image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") paddle.seed(0) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.seed(0) text_encoder = CLIPTextModel( CLIPTextConfig( @@ -88,7 +98,9 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, )) + vocab_size=1000, + ) + ) paddle.seed(0) unet = UNet2DConditionModel( sample_size=32, @@ -103,7 +115,8 @@ def get_dummy_components(self): cross_attention_dim=embedder_hidden_size, layers_per_block=1, upcast_attention=True, - use_linear_projection=True, ) + use_linear_projection=True, + ) paddle.seed(0) scheduler = DDIMScheduler( beta_schedule="scaled_linear", @@ -111,7 +124,8 @@ def get_dummy_components(self): beta_end=0.012, prediction_type="v_prediction", set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) paddle.seed(0) vae = AutoencoderKL() components = { @@ -143,13 +157,11 @@ def get_dummy_inputs(self, seed=0): def test_attention_slicing_forward_pass(self): test_max_difference = False - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) def test_inference_batch_single_identical(self): test_max_difference = False - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference) + self._test_inference_batch_single_identical(test_max_difference=test_max_difference) # @slow diff --git a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index aa2328fb72a16..eb769ee92815b 100644 --- a/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/ppdiffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -19,24 +19,36 @@ import numpy as np import paddle from paddlenlp.transformers import ( - CLIPImageProcessor, CLIPTextConfig, CLIPTextModel, CLIPTokenizer, - CLIPVisionConfig, CLIPVisionModelWithProjection) - -from ppdiffusers import (AutoencoderKL, DDIMScheduler, DDPMScheduler, - StableUnCLIPImg2ImgPipeline, UNet2DConditionModel) + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DDPMScheduler, + StableUnCLIPImg2ImgPipeline, + UNet2DConditionModel, +) from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline -from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import \ - StableUnCLIPImageNormalizer +from ppdiffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import ( + StableUnCLIPImageNormalizer, +) from ppdiffusers.utils.import_utils import is_ppxformers_available from ppdiffusers.utils.testing_utils import floats_tensor -from 
..pipeline_params import (TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS) +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineTesterMixin -class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableUnCLIPImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS @@ -53,15 +65,14 @@ def get_dummy_components(self): num_attention_heads=4, image_size=32, intermediate_size=37, - patch_size=1, )) + patch_size=1, + ) + ) paddle.seed(0) - image_normalizer = StableUnCLIPImageNormalizer( - embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler( - beta_schedule="squaredcos_cap_v2") + image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) + image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") paddle.seed(0) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") paddle.seed(0) text_encoder = CLIPTextModel( CLIPTextConfig( @@ -74,7 +85,9 @@ def get_dummy_components(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, )) + vocab_size=1000, + ) + ) paddle.seed(0) unet = UNet2DConditionModel( sample_size=32, @@ -89,7 +102,8 @@ def get_dummy_components(self): cross_attention_dim=embedder_hidden_size, layers_per_block=1, upcast_attention=True, - use_linear_projection=True, ) + use_linear_projection=True, + ) paddle.seed(0) scheduler = DDIMScheduler( beta_schedule="scaled_linear", @@ -97,7 +111,8 @@ def get_dummy_components(self): beta_end=0.012, prediction_type="v_prediction", set_alpha_to_one=False, - steps_offset=1, ) + steps_offset=1, + ) paddle.seed(0) vae = AutoencoderKL() components = { @@ -124,17 +139,19 @@ def test_image_embeds_none(self): image_slice = image[0, -3:, -3:, -1] assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([ - 0.40317363, - 1.0, - 0.5802471, - 0.47334313, - 0.39546987, - 0.72409034, - 0.15691131, - 0.42981434, - 0.72585064, - ]) + expected_slice = np.array( + [ + 0.40317363, + 1.0, + 0.5802471, + 0.47334313, + 0.39546987, + 0.72409034, + 0.15691131, + 0.42981434, + 0.72585064, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 @@ -145,8 +162,7 @@ def get_dummy_inputs(self, seed=0, pil_image=True): if pil_image: input_image = input_image * 0.5 + 0.5 input_image = input_image.clip(min=0, max=1) - input_image = (input_image.cpu().transpose( - perm=[0, 2, 3, 1]).cast("float32").numpy()) + input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy() input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] return { "prompt": "An anime racoon running a marathon", @@ -158,21 +174,18 @@ def get_dummy_inputs(self, seed=0, pil_image=True): def test_attention_slicing_forward_pass(self): test_max_difference = False - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) def test_inference_batch_single_identical(self): test_max_difference = False - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference) + 
self._test_inference_batch_single_identical(test_max_difference=test_max_difference) @unittest.skipIf( not is_ppxformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", ) def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - test_max_difference=False) + self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False) # @slow diff --git a/ppdiffusers/tests/pipelines/test_pipelines.py b/ppdiffusers/tests/pipelines/test_pipelines.py index ce6bcc0752a00..ef0b785f3ed4a 100644 --- a/ppdiffusers/tests/pipelines/test_pipelines.py +++ b/ppdiffusers/tests/pipelines/test_pipelines.py @@ -18,7 +18,6 @@ import os import random import shutil -import sys import tempfile import unittest import unittest.mock as mock @@ -29,24 +28,50 @@ import requests_mock import safetensors.torch from paddlenlp.transformers import ( - CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer) + CLIPImageProcessor, + CLIPModel, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, +) from parameterized import parameterized from PIL import Image from requests.exceptions import HTTPError from ppdiffusers import ( - AutoencoderKL, DDIMPipeline, DDIMScheduler, DDPMPipeline, DDPMScheduler, - DiffusionPipeline, DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipelineLegacy, StableDiffusionPipeline, - UNet2DConditionModel, UNet2DModel, logging) + AutoencoderKL, + DDIMPipeline, + DDIMScheduler, + DDPMPipeline, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionPipeline, + UNet2DConditionModel, + UNet2DModel, + logging, +) from ppdiffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from ppdiffusers.utils import (CONFIG_NAME, TORCH_WEIGHTS_NAME, floats_tensor, - nightly, slow) -from ppdiffusers.utils.testing_utils import (CaptureLogger, get_tests_dir, - require_compel, require_paddle_gpu, - require_torch) +from ppdiffusers.utils import ( + CONFIG_NAME, + TORCH_WEIGHTS_NAME, + floats_tensor, + nightly, + slow, +) +from ppdiffusers.utils.testing_utils import ( + CaptureLogger, + get_tests_dir, + require_compel, + require_paddle_gpu, + require_torch, +) class DownloadTests(unittest.TestCase): @@ -57,12 +82,12 @@ def test_one_request_upon_cached(self): "hf-internal-testing/tiny-stable-diffusion-pipe", cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) download_requests = [r.method for r in m.request_history] assert download_requests.count("HEAD") == 15, "15 calls to files" - assert (download_requests.count("GET") == 17 - ), "15 calls to files + model_info + model_index.json" + assert download_requests.count("GET") == 17, "15 calls to files + model_info + model_index.json" assert ( len(download_requests) == 32 ), "2 calls per file (15 files) + send_telemetry, model_info and model_index.json" @@ -73,11 +98,11 @@ def test_one_request_upon_cached(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) cache_requests = [r.method for r in m.request_history] - assert cache_requests.count( - "HEAD") == 1, "model_index.json is only HEAD" + assert 
cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" assert cache_requests.count("GET") == 1, "model info is only GET" assert ( len(cache_requests) == 2 @@ -90,7 +115,8 @@ def test_less_downloads_passed_object(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) # make sure safety checker is not downloaded assert "safety_checker" not in os.listdir(cached_folder) @@ -112,14 +138,14 @@ def test_less_downloads_passed_object_calls(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) download_requests = [r.method for r in m.request_history] # 15 - 2 because no call to config or model file for `safety_checker` assert download_requests.count("HEAD") == 13, "13 calls to files" # 17 - 2 because no call to config or model file for `safety_checker` - assert (download_requests.count("GET") == 15 - ), "13 calls to files + model_info + model_index.json" + assert download_requests.count("GET") == 15, "13 calls to files + model_info + model_index.json" assert ( len(download_requests) == 28 ), "2 calls per file (13 files) + send_telemetry, model_info and model_index.json" @@ -130,11 +156,11 @@ def test_less_downloads_passed_object_calls(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) cache_requests = [r.method for r in m.request_history] - assert cache_requests.count( - "HEAD") == 1, "model_index.json is only HEAD" + assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" assert cache_requests.count("GET") == 1, "model info is only GET" assert ( len(cache_requests) == 2 @@ -147,15 +173,11 @@ def test_download_only_pytorch(self): safety_checker=None, cache_dir=tmpdirname, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - all_root_files = [ - t[-1] - for t in os.walk( - os.path.join(tmpdirname, - os.listdir(tmpdirname)[0], "snapshots")) - ] + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))] files = [item for sublist in all_root_files for item in sublist] assert not any(f.endswith(".msgpack") for f in files) assert not any(f.endswith(".safetensors") for f in files) @@ -163,25 +185,18 @@ def test_download_only_pytorch(self): def test_returned_cached_folder(self): prompt = "hello" pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) _, local_path = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, - return_cached_folder=True, ) + return_cached_folder=True, + ) pipe_2 = StableDiffusionPipeline.from_pretrained(local_path) generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_force_safetensors_error(self): @@ -194,7 +209,8 @@ def 
test_force_safetensors_error(self): from_diffusers=True, safety_checker=None, cache_dir=tmpdirname, - use_safetensors=True, ) + use_safetensors=True, + ) def test_download_safetensors(self): with tempfile.TemporaryDirectory() as tmpdirname: @@ -204,7 +220,8 @@ def test_download_safetensors(self): from_diffusers=True, safety_checker=None, cache_dir=tmpdirname, - use_safetensors=True, ) + use_safetensors=True, + ) all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] files = [item for sublist in all_root_files for item in sublist] @@ -219,11 +236,10 @@ def test_download_safetensors_index(self): use_safetensors=True, variant=variant, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) - all_root_files = [ - t[-1] for t in os.walk(os.path.join(tmpdirname)) - ] + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] files = [item for sublist in all_root_files for item in sublist] # None of the downloaded files should be a safetensors file even if we have some here: @@ -246,11 +262,10 @@ def test_download_bin_index(self): use_safetensors=False, variant=variant, from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) - all_root_files = [ - t[-1] for t in os.walk(os.path.join(tmpdirname)) - ] + all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] files = [item for sublist in all_root_files for item in sublist] # None of the downloaded files should be a safetensors file even if we have some here: @@ -267,66 +282,39 @@ def test_download_bin_index(self): def test_download_no_safety_checker(self): prompt = "hello" pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images - pipe_2 = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch") + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images + pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_load_no_safety_checker_explicit_locally(self): prompt = "hello" pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe_2 = StableDiffusionPipeline.from_pretrained( - tmpdirname, safety_checker=None) + pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None) generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, 
output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_load_no_safety_checker_default_locally(self): prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch") + pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") generator = paddle.Generator().manual_seed(0) - out = pipe( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname) generator = paddle.Generator().manual_seed(0) - out_2 = pipe_2( - prompt, - num_inference_steps=2, - generator=generator, - output_type="numpy").images + out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images assert np.max(np.abs(out - out_2)) < 0.001 def test_cached_files_are_used_when_no_internet(self): @@ -336,21 +324,16 @@ def test_cached_files_are_used_when_no_internet(self): response_mock.raise_for_status.side_effect = HTTPError response_mock.json.return_value = {} orig_pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) - orig_comps = { - k: v - for k, v in orig_pipe.components.items() if hasattr(v, "parameters") - } + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) + orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")} with mock.patch("requests.request", return_value=response_mock): pipe = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, - local_files_only=True, ) - comps = { - k: v - for k, v in pipe.components.items() if hasattr(v, "parameters") - } + local_files_only=True, + ) + comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")} for m1, m2 in zip(orig_comps.values(), comps.values()): for p1, p2 in zip(m1.parameters(), m2.parameters()): if (p1 != p2).sum() > 0: @@ -365,11 +348,11 @@ def test_download_from_variant_folder(self): with tempfile.TemporaryDirectory() as tmpdirname: tmpdirname = StableDiffusionPipeline.download( "hf-internal-testing/stable-diffusion-all-variants", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) all_root_files = [t[-1] for t in os.walk(tmpdirname)] files = [item for sublist in all_root_files for item in sublist] - assert (len(files) == 15 - ), f"We should only download 15 files, not {len(files)}" + assert len(files) == 15, f"We should only download 15 files, not {len(files)}" assert not any(f.endswith(other_format) for f in files) assert not any(len(f.split(".")) == 3 for f in files) ppdiffusers.utils.import_utils._safetensors_available = True @@ -386,22 +369,15 @@ def test_download_variant_all(self): StableDiffusionPipeline.from_pretrained( "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, - variant=variant, ) + variant=variant, + ) all_root_files = [ - t[-1] - for t in os.walk( - os.path.join(tmpdirname, - os.listdir(tmpdirname)[0], "snapshots")) + t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots")) ] files = [item for sublist in all_root_files for item in sublist] - assert (len(files) == 15 - ), f"We should only download 15 files, not {len(files)}" - assert (len([ - f for f in files if 
f.endswith(f"{variant}{this_format}") - ]) == 4) - assert not any( - f.endswith(this_format) and - not f.endswith(f"{variant}{this_format}") for f in files) + assert len(files) == 15, f"We should only download 15 files, not {len(files)}" + assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 4 + assert not any(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) assert not any(f.endswith(other_format) for f in files) ppdiffusers.utils.import_utils._safetensors_available = True @@ -417,21 +393,16 @@ def test_download_variant_partly(self): tmpdirname = StableDiffusionPipeline.download( "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, - variant=variant, ) + variant=variant, + ) all_root_files = [t[-1] for t in os.walk(tmpdirname)] files = [item for sublist in all_root_files for item in sublist] unet_files = os.listdir(os.path.join(tmpdirname, "unet")) - assert (len(files) == 15 - ), f"We should only download 15 files, not {len(files)}" + assert len(files) == 15, f"We should only download 15 files, not {len(files)}" assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files - assert (len([ - f for f in files if f.endswith(f"{variant}{this_format}") - ]) == 1) - assert (sum( - f.endswith(this_format) and - not f.endswith(f"{variant}{this_format}") - for f in files) == 3) + assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1 + assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3 assert not any(f.endswith(other_format) for f in files) ppdiffusers.utils.import_utils._safetensors_available = True @@ -467,59 +438,52 @@ def test_local_save_load_index(self): @require_torch def test_text_inversion_download(self): pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) import torch num_tokens = len(pipe.tokenizer) # single token load local with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<*>": torch.ones((32, ))} + ten = {"<*>": torch.ones((32,))} torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) pipe.load_textual_inversion(tmpdirname, from_diffusers=True) token = pipe.tokenizer.convert_tokens_to_ids("<*>") assert token == num_tokens, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 32) + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32 assert pipe._maybe_convert_prompt("<*>", pipe.tokenizer) == "<*>" prompt = "hey <*>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) # single token load local with weight name ten = {"<**>": 2 * torch.ones((1, 32))} torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - pipe.load_textual_inversion( - tmpdirname, - weight_name="learned_embeds.bin", - from_diffusers=True) + pipe.load_textual_inversion(tmpdirname, weight_name="learned_embeds.bin", from_diffusers=True) token = pipe.tokenizer.convert_tokens_to_ids("<**>") assert token == num_tokens + 1, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 64) + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64 assert 
pipe._maybe_convert_prompt("<**>", pipe.tokenizer) == "<**>" prompt = "hey <**>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) # multi token load ten = { - "<***>": torch.cat([ - 3 * torch.ones((1, 32)), - 4 * torch.ones((1, 32)), - 5 * torch.ones((1, 32)), - ]) + "<***>": torch.cat( + [ + 3 * torch.ones((1, 32)), + 4 * torch.ones((1, 32)), + 5 * torch.ones((1, 32)), + ] + ) } torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) @@ -532,38 +496,31 @@ def test_text_inversion_download(self): assert token == num_tokens + 2, "Added token must be at spot `num_tokens`" assert token_1 == num_tokens + 3, "Added token must be at spot `num_tokens`" assert token_2 == num_tokens + 4, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() - == 96) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() - == 128) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 160) - assert (pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == - "<***> <***>_1 <***>_2") + assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 + assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 + assert pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == "<***> <***>_1 <***>_2" prompt = "hey <***>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) # multi token load a1111 ten = { "string_to_param": { - "*": torch.cat([ - 3 * torch.ones((1, 32)), - 4 * torch.ones((1, 32)), - 5 * torch.ones((1, 32)), - ]) + "*": torch.cat( + [ + 3 * torch.ones((1, 32)), + 4 * torch.ones((1, 32)), + 5 * torch.ones((1, 32)), + ] + ) }, "name": "<****>", } torch.save(ten, os.path.join(tmpdirname, "a1111.bin")) - pipe.load_textual_inversion( - tmpdirname, weight_name="a1111.bin", from_diffusers=True) + pipe.load_textual_inversion(tmpdirname, weight_name="a1111.bin", from_diffusers=True) token = pipe.tokenizer.convert_tokens_to_ids("<****>") token_1 = pipe.tokenizer.convert_tokens_to_ids("<****>_1") @@ -572,21 +529,13 @@ def test_text_inversion_download(self): assert token == num_tokens + 5, "Added token must be at spot `num_tokens`" assert token_1 == num_tokens + 6, "Added token must be at spot `num_tokens`" assert token_2 == num_tokens + 7, "Added token must be at spot `num_tokens`" - assert ( - pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() - == 96) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() - == 128) - assert ( - pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() - == 160) - assert (pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == - "<****> <****>_1 <****>_2") + assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 + assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 + assert pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == "<****> <****>_1 <****>_2" prompt = "hey <****>" - out = pipe( - prompt, num_inference_steps=1, output_type="numpy").images + out = pipe(prompt, num_inference_steps=1, 
output_type="numpy").images assert out.shape == (1, 128, 128, 3) def test_download_ignore_files(self): @@ -595,20 +544,16 @@ def test_download_ignore_files(self): # pipeline has Flax weights tmpdirname = DiffusionPipeline.download( "hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) files = [] for root, ds, fs in os.walk(tmpdirname): for f in fs: - str_path = str(os.path.join(root, f)).replace( - str(tmpdirname) + "/", "") + str_path = str(os.path.join(root, f)).replace(str(tmpdirname) + "/", "") files.append(str_path) # None of the downloaded files should be a pytorch file even if we have some here: # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack - assert not any(f in files - for f in [ - "vae/diffusion_pytorch_model.bin", - "text_encoder/config.json" - ]) + assert not any(f in files for f in ["vae/diffusion_pytorch_model.bin", "text_encoder/config.json"]) assert len(files) == 13 @@ -616,7 +561,8 @@ class CustomPipelineTests(unittest.TestCase): def test_load_custom_pipeline(self): pipeline = DiffusionPipeline.from_pretrained( "google/ddpm-cifar10-32", - custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", ) + custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", + ) pipeline = pipeline assert pipeline.__class__.__name__ == "CustomPipeline" @@ -644,7 +590,8 @@ def test_load_custom_pipeline(self): def test_run_custom_pipeline(self): pipeline = DiffusionPipeline.from_pretrained( "google/ddpm-cifar10-32", - custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", ) + custom_pipeline="junnyu/ppdiffusers-dummy-pipeline", + ) pipeline = pipeline images, output_str = pipeline(num_inference_steps=2, output_type="np") assert images[0].shape == (1, 32, 32, 3) @@ -653,8 +600,8 @@ def test_run_custom_pipeline(self): def test_local_custom_pipeline_repo(self): local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", - custom_pipeline=local_custom_pipeline_path) + "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path + ) pipeline = pipeline images, output_str = pipeline(num_inference_steps=2, output_type="np") assert pipeline.__class__.__name__ == "CustomLocalPipeline" @@ -663,11 +610,10 @@ def test_local_custom_pipeline_repo(self): def test_local_custom_pipeline_file(self): local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, - "what_ever.py") + local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py") pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", - custom_pipeline=local_custom_pipeline_path) + "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path + ) pipeline = pipeline images, output_str = pipeline(num_inference_steps=2, output_type="np") assert pipeline.__class__.__name__ == "CustomLocalPipeline" @@ -678,13 +624,13 @@ def test_local_custom_pipeline_file(self): @require_paddle_gpu def test_download_from_git(self): clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - feature_extractor = CLIPImageProcessor.from_pretrained( - clip_model_id, from_hf_hub=False) + feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id, from_hf_hub=False) clip_model = CLIPModel.from_pretrained( clip_model_id, paddle_dtype=paddle.float16, from_hf_hub=False, - from_diffusers=False, ) + from_diffusers=False, + ) pipeline = 
DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", custom_pipeline="clip_guided_stable_diffusion", @@ -692,17 +638,17 @@ def test_download_from_git(self): feature_extractor=feature_extractor, paddle_dtype=paddle.float16, from_hf_hub=False, - from_diffusers=False, ) + from_diffusers=False, + ) pipeline.enable_attention_slicing() assert pipeline.__class__.__name__ == "CLIPGuidedStableDiffusion" - image = pipeline( - "a prompt", num_inference_steps=2, output_type="np").images[0] + image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0] assert image.shape == (512, 512, 3) def test_save_pipeline_change_config(self): pipe = DiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - safety_checker=None) + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None + ) with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) @@ -710,8 +656,7 @@ def test_save_pipeline_change_config(self): assert pipe.scheduler.__class__.__name__ == "PNDMScheduler" - pipe.scheduler = DPMSolverMultistepScheduler.from_config( - pipe.scheduler.config) + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe.save_pretrained(tmpdirname) pipe = DiffusionPipeline.from_pretrained(tmpdirname) @@ -732,8 +677,7 @@ def dummy_image(self): batch_size = 1 num_channels = 3 sizes = 32, 32 - image = floats_tensor( - (batch_size, num_channels) + sizes, rng=random.Random(0)) + image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)) return image def dummy_uncond_unet(self, sample_size=32): @@ -745,7 +689,8 @@ def dummy_uncond_unet(self, sample_size=32): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) return model def dummy_cond_unet(self, sample_size=32): @@ -758,7 +703,8 @@ def dummy_cond_unet(self, sample_size=32): out_channels=4, down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, ) + cross_attention_dim=32, + ) return model @property @@ -770,7 +716,8 @@ def dummy_vae(self): out_channels=3, down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, ) + latent_channels=4, + ) return model @property @@ -785,7 +732,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -803,24 +751,21 @@ def to(self, device): return extract - @parameterized.expand([ - [DDIMScheduler, DDIMPipeline, 32], - [DDPMScheduler, DDPMPipeline, 32], - [DDIMScheduler, DDIMPipeline, (32, 64)], - [DDPMScheduler, DDPMPipeline, (64, 32)], - ]) - def test_uncond_unet_components(self, - scheduler_fn=DDPMScheduler, - pipeline_fn=DDPMPipeline, - sample_size=32): + @parameterized.expand( + [ + [DDIMScheduler, DDIMPipeline, 32], + [DDPMScheduler, DDPMPipeline, 32], + [DDIMScheduler, DDIMPipeline, (32, 64)], + [DDPMScheduler, DDPMPipeline, (64, 32)], + ] + ) + def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32): unet = self.dummy_uncond_unet(sample_size) scheduler = scheduler_fn() pipeline = pipeline_fn(unet, scheduler) generator = paddle.Generator().manual_seed(0) - out_image = pipeline( - generator=generator, 
num_inference_steps=2, output_type="np").images - sample_size = ((sample_size, sample_size) - if isinstance(sample_size, int) else sample_size) + out_image = pipeline(generator=generator, num_inference_steps=2, output_type="np").images + sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size assert out_image.shape == (1, *sample_size, 3) def test_stable_diffusion_components(self): @@ -829,13 +774,10 @@ def test_stable_diffusion_components(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") image = self.dummy_image().cpu().transpose(perm=[0, 2, 3, 1])[0] init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = ( - Image.fromarray(np.uint8(image + 4)).convert("RGB").resize( - (32, 32))) + mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) inpaint = StableDiffusionInpaintPipelineLegacy( unet=unet, scheduler=scheduler, @@ -843,7 +785,8 @@ def test_stable_diffusion_components(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) img2img = StableDiffusionImg2ImgPipeline(**inpaint.components) text2img = StableDiffusionPipeline(**inpaint.components) prompt = "A painting of a squirrel eating a burger" @@ -854,18 +797,16 @@ def test_stable_diffusion_components(self): num_inference_steps=2, output_type="np", image=init_image, - mask_image=mask_image, ).images + mask_image=mask_image, + ).images image_img2img = img2img( [prompt], generator=generator, num_inference_steps=2, output_type="np", - image=init_image, ).images - image_text2img = text2img( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np").images + image=init_image, + ).images + image_text2img = text2img([prompt], generator=generator, num_inference_steps=2, output_type="np").images assert image_inpaint.shape == (1, 32, 32, 3) assert image_img2img.shape == (1, 32, 32, 3) assert image_text2img.shape == (1, 64, 64, 3) @@ -875,8 +816,7 @@ def test_set_scheduler(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd = StableDiffusionPipeline( unet=unet, scheduler=scheduler, @@ -884,7 +824,8 @@ def test_set_scheduler(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, DDIMScheduler) sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config) @@ -895,11 +836,9 @@ def test_set_scheduler(self): assert isinstance(sd.scheduler, LMSDiscreteScheduler) sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, EulerDiscreteScheduler) - sd.scheduler = EulerAncestralDiscreteScheduler.from_config( - sd.scheduler.config) + sd.scheduler = EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler) - sd.scheduler = DPMSolverMultistepScheduler.from_config( - sd.scheduler.config) + sd.scheduler = 
DPMSolverMultistepScheduler.from_config(sd.scheduler.config) assert isinstance(sd.scheduler, DPMSolverMultistepScheduler) def test_set_component_to_none(self): @@ -907,8 +846,7 @@ def test_set_component_to_none(self): scheduler = PNDMScheduler(skip_prk_steps=True) vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") pipeline = StableDiffusionPipeline( unet=unet, @@ -917,7 +855,8 @@ def test_set_component_to_none(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) generator = paddle.Generator().manual_seed(0) @@ -927,7 +866,8 @@ def test_set_component_to_none(self): prompt=prompt, generator=generator, num_inference_steps=1, - output_type="np", ).images + output_type="np", + ).images pipeline.feature_extractor = None generator = paddle.Generator().manual_seed(0) @@ -935,23 +875,19 @@ def test_set_component_to_none(self): prompt=prompt, generator=generator, num_inference_steps=1, - output_type="np", ).images + output_type="np", + ).images assert out_image.shape == (1, 64, 64, 3) assert np.abs(out_image - out_image_2).max() < 1e-3 def test_set_scheduler_consistency(self): unet = self.dummy_cond_unet() - pndm = PNDMScheduler.from_config( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") - ddim = DDIMScheduler.from_config( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") + ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") vae = self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd = StableDiffusionPipeline( unet=unet, scheduler=pndm, @@ -959,15 +895,13 @@ def test_set_scheduler_consistency(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) pndm_config = sd.scheduler.config sd.scheduler = DDPMScheduler.from_config(pndm_config) sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config) pndm_config_2 = sd.scheduler.config - pndm_config_2 = { - k: v - for k, v in pndm_config_2.items() if k in pndm_config - } + pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config} assert dict(pndm_config) == dict(pndm_config_2) sd = StableDiffusionPipeline( unet=unet, @@ -976,40 +910,33 @@ def test_set_scheduler_consistency(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=None, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) ddim_config = sd.scheduler.config sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config) sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) ddim_config_2 = sd.scheduler.config - ddim_config_2 = { - k: v - for k, v in ddim_config_2.items() if k in ddim_config - } + ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config} assert dict(ddim_config) == dict(ddim_config_2) def test_save_safe_serialization(self): pipeline = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-torch", 
from_hf_hub=True, - from_diffusers=True, ) + from_diffusers=True, + ) with tempfile.TemporaryDirectory() as tmpdirname: - pipeline.save_pretrained( - tmpdirname, safe_serialization=True, to_diffusers=True) - vae_path = os.path.join(tmpdirname, "vae", - "diffusion_pytorch_model.safetensors") + pipeline.save_pretrained(tmpdirname, safe_serialization=True, to_diffusers=True) + vae_path = os.path.join(tmpdirname, "vae", "diffusion_pytorch_model.safetensors") assert os.path.exists(vae_path), f"Could not find {vae_path}" _ = safetensors.torch.load_file(vae_path) - unet_path = os.path.join(tmpdirname, "unet", - "diffusion_pytorch_model.safetensors") + unet_path = os.path.join(tmpdirname, "unet", "diffusion_pytorch_model.safetensors") assert os.path.exists(unet_path), f"Could not find {unet_path}" _ = safetensors.torch.load_file(unet_path) - text_encoder_path = os.path.join(tmpdirname, "text_encoder", - "model.safetensors") - assert os.path.exists( - text_encoder_path), f"Could not find {text_encoder_path}" + text_encoder_path = os.path.join(tmpdirname, "text_encoder", "model.safetensors") + assert os.path.exists(text_encoder_path), f"Could not find {text_encoder_path}" _ = safetensors.torch.load_file(text_encoder_path) - pipeline = StableDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=True) + pipeline = StableDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=True) assert pipeline.unet is not None assert pipeline.vae is not None assert pipeline.text_encoder is not None @@ -1020,17 +947,17 @@ def test_no_pytorch_download_when_doing_safetensors(self): with tempfile.TemporaryDirectory() as tmpdirname: _ = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/diffusers-stable-diffusion-tiny-all", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) path = os.path.join( tmpdirname, "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all", "snapshots", "07838d72e12f9bcec1375b0482b80c1d399be843", - "unet", ) - assert os.path.exists( - os.path.join(path, "diffusion_pytorch_model.safetensors")) - assert not os.path.exists( - os.path.join(path, "diffusion_pytorch_model.bin")) + "unet", + ) + assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors")) + assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin")) def test_no_safetensors_download_when_doing_pytorch(self): import ppdiffusers @@ -1039,28 +966,25 @@ def test_no_safetensors_download_when_doing_pytorch(self): with tempfile.TemporaryDirectory() as tmpdirname: _ = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/diffusers-stable-diffusion-tiny-all", - cache_dir=tmpdirname, ) + cache_dir=tmpdirname, + ) path = os.path.join( tmpdirname, "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all", "snapshots", "07838d72e12f9bcec1375b0482b80c1d399be843", - "unet", ) - assert not os.path.exists( - os.path.join(path, "diffusion_pytorch_model.safetensors")) - assert os.path.exists( - os.path.join(path, "diffusion_pytorch_model.bin")) + "unet", + ) + assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors")) + assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin")) ppdiffusers.utils.import_utils._safetensors_available = True def test_optional_components(self): unet = self.dummy_cond_unet() - pndm = PNDMScheduler.from_config( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler") + pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler") vae = 
self.dummy_vae bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") sd = StableDiffusionPipeline( unet=unet, scheduler=pndm, @@ -1068,7 +992,8 @@ def test_optional_components(self): text_encoder=bert, tokenizer=tokenizer, safety_checker=unet, - feature_extractor=self.dummy_extractor, ) + feature_extractor=self.dummy_extractor, + ) assert sd.config.requires_safety_checker is True with tempfile.TemporaryDirectory() as tmpdirname: sd.save_pretrained(tmpdirname) @@ -1076,7 +1001,8 @@ def test_optional_components(self): tmpdirname, feature_extractor=None, safety_checker=None, - requires_safety_checker=False, ) + requires_safety_checker=False, + ) assert sd.config.requires_safety_checker is False assert sd.config.safety_checker == (None, None) assert sd.config.feature_extractor == (None, None) @@ -1092,8 +1018,7 @@ def test_optional_components(self): config["safety_checker"] = [None, None] with open(os.path.join(tmpdirname, sd.config_name), "w") as f: json.dump(config, f) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, requires_safety_checker=False) + sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False) sd.save_pretrained(tmpdirname) sd = StableDiffusionPipeline.from_pretrained(tmpdirname) assert sd.config.requires_safety_checker is False @@ -1110,8 +1035,7 @@ def test_optional_components(self): assert sd.config.safety_checker == (None, None) assert sd.config.feature_extractor == (None, None) sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, feature_extractor=self.dummy_extractor) + sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor) assert sd.config.requires_safety_checker is False assert sd.config.safety_checker == (None, None) assert sd.config.feature_extractor != (None, None) @@ -1119,13 +1043,13 @@ def test_optional_components(self): tmpdirname, feature_extractor=self.dummy_extractor, safety_checker=unet, - requires_safety_checker=[True, True], ) + requires_safety_checker=[True, True], + ) assert sd.config.requires_safety_checker == [True, True] assert sd.config.safety_checker != (None, None) assert sd.config.feature_extractor != (None, None) sd.save_pretrained(tmpdirname) - sd = StableDiffusionPipeline.from_pretrained( - tmpdirname, feature_extractor=self.dummy_extractor) + sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor) assert sd.config.requires_safety_checker == [True, True] assert sd.config.safety_checker != (None, None) assert sd.config.feature_extractor != (None, None) @@ -1146,42 +1070,28 @@ def tearDown(self): def test_smart_download(self): model_id = "hf-internal-testing/unet-pipeline-dummy" with tempfile.TemporaryDirectory() as tmpdirname: - _ = DiffusionPipeline.from_pretrained( - model_id, cache_dir=tmpdirname, force_download=True) + _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True) local_repo_name = "--".join(["models"] + model_id.split("/")) - snapshot_dir = os.path.join(tmpdirname, local_repo_name, - "snapshots") - snapshot_dir = os.path.join(snapshot_dir, - os.listdir(snapshot_dir)[0]) - assert os.path.isfile( - os.path.join(snapshot_dir, DiffusionPipeline.config_name)) + snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots") + snapshot_dir = os.path.join(snapshot_dir, 
os.listdir(snapshot_dir)[0]) + assert os.path.isfile(os.path.join(snapshot_dir, DiffusionPipeline.config_name)) assert os.path.isfile(os.path.join(snapshot_dir, CONFIG_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, TORCH_WEIGHTS_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) - assert os.path.isfile( - os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) - assert not os.path.isfile( - os.path.join(snapshot_dir, "big_array.npy")) + assert os.path.isfile(os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, TORCH_WEIGHTS_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) + assert os.path.isfile(os.path.join(snapshot_dir, "unet", TORCH_WEIGHTS_NAME)) + assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy")) def test_warning_unused_kwargs(self): model_id = "hf-internal-testing/unet-pipeline-dummy" logger = logging.get_logger("ppdiffusers.pipelines") with tempfile.TemporaryDirectory() as tmpdirname: with CaptureLogger(logger) as cap_logger: - DiffusionPipeline.from_pretrained( - model_id, - not_used=True, - cache_dir=tmpdirname, - force_download=True) + DiffusionPipeline.from_pretrained(model_id, not_used=True, cache_dir=tmpdirname, force_download=True) assert ( - cap_logger.out.strip().split("\n")[-1] == - "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored." + cap_logger.out.strip().split("\n")[-1] + == "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored." 
) def test_from_save_pretrained(self): @@ -1192,7 +1102,8 @@ def test_from_save_pretrained(self): in_channels=3, out_channels=3, down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) scheduler = DDPMScheduler(num_train_timesteps=10) ddpm = DDPMPipeline(model, scheduler) ddpm.set_progress_bar_config(disable=None) @@ -1202,59 +1113,41 @@ def test_from_save_pretrained(self): new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=5, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - new_image = new_ddpm( - generator=generator, num_inference_steps=5, - output_type="numpy").images + new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - assert (np.abs(image - new_image).sum() < 1e-5 - ), "Models don't give the same forward pass" + assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" def test_from_pretrained_hub(self): model_path = "google/ddpm-cifar10-32" scheduler = DDPMScheduler(num_train_timesteps=10) ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler) ddpm.set_progress_bar_config(disable=None) - ddpm_from_hub = DiffusionPipeline.from_pretrained( - model_path, scheduler=scheduler) + ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) ddpm_from_hub = ddpm_from_hub ddpm_from_hub.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm( - generator=generator, num_inference_steps=5, - output_type="numpy").images + image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - new_image = ddpm_from_hub( - generator=generator, num_inference_steps=5, - output_type="numpy").images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't give the same forward pass" + new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass" def test_from_pretrained_hub_pass_model(self): model_path = "google/ddpm-cifar10-32" scheduler = DDPMScheduler(num_train_timesteps=10) unet = UNet2DModel.from_pretrained(model_path) - ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained( - model_path, unet=unet, scheduler=scheduler) + ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler) ddpm_from_hub_custom_model = ddpm_from_hub_custom_model ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) - ddpm_from_hub = DiffusionPipeline.from_pretrained( - model_path, scheduler=scheduler) + ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) - image = ddpm_from_hub_custom_model( - generator=generator, num_inference_steps=5, - output_type="numpy").images + image = ddpm_from_hub_custom_model(generator=generator, num_inference_steps=5, output_type="numpy").images generator = paddle.Generator().manual_seed(0) - new_image = ddpm_from_hub( - generator=generator, num_inference_steps=5, - output_type="numpy").images - assert (np.abs(image - 
new_image).sum() < 1e-05 - ), "Models don't give the same forward pass" + new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't give the same forward pass" def test_output_format(self): model_path = "google/ddpm-cifar10-32" @@ -1292,8 +1185,7 @@ def test_ddpm_ddim_equality_batched(self): ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler) ddim.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(seed) - ddpm_images = ddpm( - batch_size=2, generator=generator, output_type="numpy").images + ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images generator = paddle.Generator().manual_seed(seed) ddim_images = ddim( batch_size=2, @@ -1301,5 +1193,6 @@ def test_ddpm_ddim_equality_batched(self): num_inference_steps=1000, eta=1.0, output_type="numpy", - use_clipped_model_output=True, ).images + use_clipped_model_output=True, + ).images assert np.abs(ddpm_images - ddim_images).max() < 0.1 diff --git a/ppdiffusers/tests/pipelines/test_pipelines_common.py b/ppdiffusers/tests/pipelines/test_pipelines_common.py index 5b09ecc71d187..c92b77174f7dc 100644 --- a/ppdiffusers/tests/pipelines/test_pipelines_common.py +++ b/ppdiffusers/tests/pipelines/test_pipelines_common.py @@ -48,16 +48,18 @@ class PipelineTesterMixin: # Canonical parameters that are passed to `__call__` regardless # of the type of pipeline. They are always optional and have common # sense default values. - required_optional_params = frozenset([ - "num_inference_steps", - "num_images_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "num_images_per_prompt", + "generator", + "latents", + "output_type", + "return_dict", + "callback", + "callback_steps", + ] + ) num_inference_steps_args = ["num_inference_steps"] test_attention_slicing = True test_cpu_offload = False @@ -95,7 +97,8 @@ def params(self) -> frozenset: "do not make modifications to the existing common sets of arguments. I.e. a text to image pipeline " "with non-configurable height and width arguments should set the attribute as " "`params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. " - "See existing pipeline tests for reference.") + "See existing pipeline tests for reference." + ) @property def batch_params(self) -> frozenset: @@ -108,7 +111,8 @@ def batch_params(self) -> frozenset: "do not make modifications to the existing common sets of batch arguments. I.e. a text to " "image pipeline `negative_prompt` is not batched should set the attribute as " "`batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {'negative_prompt'}`. " - "See existing pipeline tests for reference.") + "See existing pipeline tests for reference." 
+ ) def tearDown(self): super().tearDown() @@ -123,8 +127,7 @@ def test_save_load_local(self): output = pipe(**inputs)[0] with tempfile.TemporaryDirectory() as tmpdir: pipe.save_pretrained(tmpdir, to_diffusers=False) - pipe_loaded = self.pipeline_class.from_pretrained( - tmpdir, from_diffusers=False) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) pipe_loaded.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs() output_loaded = pipe_loaded(**inputs)[0] @@ -134,7 +137,8 @@ def test_save_load_local(self): def test_pipeline_call_signature(self): self.assertTrue( hasattr(self.pipeline_class, "__call__"), - f"{self.pipeline_class} should have a `__call__` method", ) + f"{self.pipeline_class} should have a `__call__` method", + ) parameters = inspect.signature(self.pipeline_class.__call__).parameters @@ -146,9 +150,7 @@ def test_pipeline_call_signature(self): parameters = set(parameters.keys()) parameters.remove("self") - parameters.discard( - "kwargs" - ) # kwargs can be added if arguments of pipeline call function are deprecated + parameters.discard("kwargs") # kwargs can be added if arguments of pipeline call function are deprecated remaining_required_parameters = set() @@ -176,9 +178,10 @@ def test_inference_batch_consistent(self, batch_sizes=[2, 4, 13]): self._test_inference_batch_consistent(batch_sizes=batch_sizes) def _test_inference_batch_consistent( - self, - batch_sizes=[2, 4, 13], - additional_params_copy_to_batched_inputs=["num_inference_steps"], ): + self, + batch_sizes=[2, 4, 13], + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): components = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe.set_progress_bar_config(disable=None) @@ -191,10 +194,7 @@ def _test_inference_batch_consistent( if name in self.batch_params: if name == "prompt": len_prompt = len(value) - batched_inputs[name] = [ - value[:len_prompt // i] - for i in range(1, batch_size + 1) - ] + batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] batched_inputs[name][-1] = 2000 * "very long" else: batched_inputs[name] = batch_size * [value] @@ -220,13 +220,14 @@ def test_inference_batch_single_identical(self, batch_size=3): self._test_inference_batch_single_identical(batch_size=batch_size) def _test_inference_batch_single_identical( - self, - batch_size=3, - test_max_difference=None, - test_mean_pixel_difference=None, - relax_max_difference=False, - expected_max_diff=1e-4, - additional_params_copy_to_batched_inputs=["num_inference_steps"], ): + self, + batch_size=3, + test_max_difference=None, + test_mean_pixel_difference=None, + relax_max_difference=False, + expected_max_diff=1e-4, + additional_params_copy_to_batched_inputs=["num_inference_steps"], + ): components = self.get_dummy_components() pipe = self.pipeline_class(**components) @@ -240,19 +241,14 @@ def _test_inference_batch_single_identical( if name in self.batch_params: if name == "prompt": len_prompt = len(value) - batched_inputs[name] = [ - value[:len_prompt // i] - for i in range(1, batch_size + 1) - ] + batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] batched_inputs[name][-1] = 2000 * "very long" else: batched_inputs[name] = batch_size * [value] elif name == "batch_size": batched_inputs[name] = batch_size elif name == "generator": - batched_inputs[name] = [ - self.get_generator(i) for i in range(batch_size) - ] + batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)] else: 
batched_inputs[name] = value @@ -293,8 +289,7 @@ def test_components_function(self): init_components = self.get_dummy_components() pipe = self.pipeline_class(**init_components) self.assertTrue(hasattr(pipe, "components")) - self.assertTrue( - set(pipe.components.keys()) == set(init_components.keys())) + self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) def test_float16_inference(self, expected_max_diff=1e-2): self._test_float16_inference(expected_max_diff) @@ -312,7 +307,8 @@ def _test_float16_inference(self, expected_max_diff=1e-2): self.assertLess( max_diff, expected_max_diff, - "The outputs of the fp16 and fp32 pipelines are too different.", ) + "The outputs of the fp16 and fp32 pipelines are too different.", + ) def test_save_load_float16(self, expected_max_diff=1e-2): self._test_save_load_float16(expected_max_diff) @@ -360,8 +356,7 @@ def test_save_load_optional_components(self): with tempfile.TemporaryDirectory() as tmpdir: # TODO check this pipe.save_pretrained(tmpdir, to_diffusers=False) - pipe_loaded = self.pipeline_class.from_pretrained( - tmpdir, from_diffusers=False) + pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, from_diffusers=False) pipe_loaded.set_progress_bar_config(disable=None) for optional_component in pipe._optional_components: self.assertTrue( @@ -394,27 +389,22 @@ def test_to_dtype(self): pipe = self.pipeline_class(**components) pipe.set_progress_bar_config(disable=None) - model_dtypes = [ - component.dtype for component in components.values() - if hasattr(component, "dtype") - ] + model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == paddle.float32 for dtype in model_dtypes)) pipe.to(paddle_dtype=paddle.float16) - model_dtypes = [ - component.dtype for component in components.values() - if hasattr(component, "dtype") - ] + model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] self.assertTrue(all(dtype == paddle.float16 for dtype in model_dtypes)) def test_attention_slicing_forward_pass(self): self._test_attention_slicing_forward_pass() def _test_attention_slicing_forward_pass( - self, - test_max_difference=True, - test_mean_pixel_difference=True, - expected_max_diff=5e-3, ): + self, + test_max_difference=True, + test_mean_pixel_difference=True, + expected_max_diff=5e-3, + ): if not self.test_attention_slicing: return @@ -427,25 +417,24 @@ def _test_attention_slicing_forward_pass( inputs = self.get_dummy_inputs() output_with_slicing = pipe(**inputs)[0] if test_max_difference: - max_diff = np.abs( - to_np(output_with_slicing) - to_np(output_without_slicing)).max( - ) + max_diff = np.abs(to_np(output_with_slicing) - to_np(output_without_slicing)).max() self.assertLess( max_diff, expected_max_diff, - "Attention slicing should not affect the inference results", ) + "Attention slicing should not affect the inference results", + ) if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_slicing[0], - output_without_slicing[0]) + assert_mean_pixel_difference(output_with_slicing[0], output_without_slicing[0]) def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass() def _test_xformers_attention_forwardGenerator_pass( - self, - test_max_difference=True, - test_mean_pixel_difference=True, - expected_max_diff=1e-2, ): + self, + test_max_difference=True, + test_mean_pixel_difference=True, + expected_max_diff=1e-2, + ): if not 
self.test_xformers_attention: return components = self.get_dummy_components() @@ -461,15 +450,14 @@ def _test_xformers_attention_forwardGenerator_pass( output_with_xformers = output_with_xformers.numpy() if hasattr(output_without_xformers, "numpy"): output_without_xformers = output_without_xformers.numpy() - max_diff = np.abs(output_with_xformers - - output_without_xformers).max() + max_diff = np.abs(output_with_xformers - output_without_xformers).max() self.assertLess( max_diff, expected_max_diff, - "XFormers attention should not affect the inference results", ) + "XFormers attention should not affect the inference results", + ) if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_xformers[0], - output_without_xformers[0]) + assert_mean_pixel_difference(output_with_xformers[0], output_without_xformers[0]) def test_progress_bar(self): components = self.get_dummy_components() @@ -482,12 +470,12 @@ def test_progress_bar(self): self.assertTrue(max_steps is not None and len(max_steps) > 0) self.assertTrue( f"{max_steps}/{max_steps}" in stderr, - "Progress bar should be enabled and stopped at the max step", ) + "Progress bar should be enabled and stopped at the max step", + ) pipe.set_progress_bar_config(disable=True) with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): _ = pipe(**inputs) - self.assertTrue(stderr.getvalue() == "", - "Progress bar should be disabled") + self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled") def test_num_images_per_prompt(self): sig = inspect.signature(self.pipeline_class.__call__) @@ -510,17 +498,13 @@ def test_num_images_per_prompt(self): if key in self.batch_params: inputs[key] = batch_size * [inputs[key]] - images = pipe( - **inputs, - num_images_per_prompt=num_images_per_prompt).images + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images assert images.shape[0] == batch_size * num_images_per_prompt def assert_mean_pixel_difference(image, expected_image): - image = np.asarray( - DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32) - expected_image = np.asarray( - DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32) + image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32) + expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32) avg_diff = np.abs(image - expected_image).mean() assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average" diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py index 23825d0855c71..b6cb10d5a3545 100644 --- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py +++ b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video.py @@ -19,9 +19,13 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (AutoencoderKL, DDIMScheduler, - DPMSolverMultistepScheduler, TextToVideoSDPipeline, - UNet3DConditionModel) +from ppdiffusers import ( + AutoencoderKL, + DDIMScheduler, + DPMSolverMultistepScheduler, + TextToVideoSDPipeline, + UNet3DConditionModel, +) from ppdiffusers.utils import load_numpy, slow from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS @@ -32,14 +36,16 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = TextToVideoSDPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - 
required_optional_params = frozenset([ - "num_inference_steps", - "generator", - "latents", - "return_dict", - "callback", - "callback_steps", - ]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback", + "callback_steps", + ] + ) def get_dummy_components(self): paddle.seed(0) @@ -53,20 +59,24 @@ def get_dummy_components(self): "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", - "DownBlock3D", ), + "DownBlock3D", + ), up_block_types=( "UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", - "CrossAttnUpBlock3D", ), + "CrossAttnUpBlock3D", + ), cross_attention_dim=32, - attention_head_dim=4, ) + attention_head_dim=4, + ) scheduler = DDIMScheduler( beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, - set_alpha_to_one=False, ) + set_alpha_to_one=False, + ) paddle.seed(0) vae = AutoencoderKL( block_out_channels=[32, 64], @@ -75,7 +85,8 @@ def get_dummy_components(self): down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4, - sample_size=128, ) + sample_size=128, + ) paddle.seed(0) text_encoder_config = CLIPTextConfig( bos_token_id=0, @@ -88,10 +99,10 @@ def get_dummy_components(self): pad_token_id=1, vocab_size=1000, hidden_act="gelu", - projection_dim=512, ) + projection_dim=512, + ) text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") components = { "unet": unet, "scheduler": scheduler, @@ -128,28 +139,20 @@ def test_text_to_video_default_case(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass( - test_mean_pixel_difference=False) + self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False) def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass( - test_mean_pixel_difference=False) + self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - @unittest.skip( - reason="Batching needs to be properly figured out first for this pipeline." - ) + @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") def test_inference_batch_consistent(self): pass - @unittest.skip( - reason="Batching needs to be properly figured out first for this pipeline." - ) + @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") def test_inference_batch_single_identical(self): pass - @unittest.skip( - reason="`num_images_per_prompt` argument is not supported for this pipeline." 
- ) + @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") def test_num_images_per_prompt(self): pass @@ -161,19 +164,13 @@ def test_full_model(self): "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy" ) pipe = TextToVideoSDPipeline.from_pretrained( - "damo-vilab/text-to-video-ms-1.7b", - from_hf_hub=True, - from_diffusers=True) - pipe.scheduler = DPMSolverMultistepScheduler.from_config( - pipe.scheduler.config) + "damo-vilab/text-to-video-ms-1.7b", from_hf_hub=True, from_diffusers=True + ) + pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe = pipe prompt = "Spiderman is surfing" generator = paddle.Generator().manual_seed(0) - video_frames = pipe( - prompt, - generator=generator, - num_inference_steps=25, - output_type="pd").frames + video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pd").frames video = video_frames.cpu().numpy() assert np.abs(expected_video - video).mean() < 0.8 @@ -181,15 +178,10 @@ def test_two_step_model(self): expected_video = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy" ) - pipe = TextToVideoSDPipeline.from_pretrained( - "damo-vilab/text-to-video-ms-1.7b") + pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") pipe = pipe prompt = "Spiderman is surfing" generator = paddle.Generator().manual_seed(0) - video_frames = pipe( - prompt, - generator=generator, - num_inference_steps=2, - output_type="pd").frames + video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pd").frames video = video_frames.cpu().numpy() assert np.abs(expected_video - video).mean() < 0.8 diff --git a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py index 8387b54267696..121798ea45e07 100644 --- a/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py +++ b/ppdiffusers/tests/pipelines/text_to_video/test_text_to_video_zero.py @@ -27,8 +27,7 @@ class TextToVideoZeroPipelineSlowTests(unittest.TestCase): def test_full_model(self): model_id = "runwayml/stable-diffusion-v1-5" - pipe = TextToVideoZeroPipeline.from_pretrained( - model_id, torch_dtype="float16") + pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype="float16") pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) generator = paddle.Generator().manual_seed(0) prompt = "A bear is playing a guitar on Times Square" diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip.py b/ppdiffusers/tests/pipelines/unclip/test_unclip.py index 3f0b1a190c645..3e8d64094abd3 100644 --- a/ppdiffusers/tests/pipelines/unclip/test_unclip.py +++ b/ppdiffusers/tests/pipelines/unclip/test_unclip.py @@ -18,18 +18,25 @@ import numpy as np import paddle -from paddlenlp.transformers import (CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer) - -from ppdiffusers import (PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, - UNet2DConditionModel, UNet2DModel) +from paddlenlp.transformers import ( + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, +) + +from ppdiffusers import ( + PriorTransformer, + UnCLIPPipeline, + UnCLIPScheduler, + UNet2DConditionModel, + UNet2DModel, +) from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel from ppdiffusers.utils import slow from ppdiffusers.utils.testing_utils 
import require_paddle_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import (PipelineTesterMixin, - assert_mean_pixel_difference) +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): @@ -44,13 +51,15 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "cross_attention_kwargs", } batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = frozenset([ - "generator", - "return_dict", - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ]) + required_optional_params = frozenset( + [ + "generator", + "return_dict", + "prior_num_inference_steps", + "decoder_num_inference_steps", + "super_res_num_inference_steps", + ] + ) test_xformers_attention = False @property @@ -75,8 +84,7 @@ def cross_attention_dim(self): @property def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") return tokenizer @property @@ -92,7 +100,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModelWithProjection(config) @property @@ -127,13 +136,14 @@ def dummy_decoder(self): "out_channels": 6, "down_block_types": ( "ResnetDownsampleBlock2D", - "SimpleCrossAttnDownBlock2D", ), - "up_block_types": - ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "SimpleCrossAttnDownBlock2D", + ), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "layers_per_block": 1, "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, @@ -148,13 +158,12 @@ def dummy_super_res_kwargs(self): return { "sample_size": 64, "layers_per_block": 1, - "down_block_types": - ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": - ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), + "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), + "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "in_channels": 6, "out_channels": 3, } @@ -183,15 +192,18 @@ def get_dummy_components(self): variance_type="fixed_small_log", prediction_type="sample", num_train_timesteps=1000, - clip_sample_range=5.0, ) + clip_sample_range=5.0, + ) decoder_scheduler = UnCLIPScheduler( variance_type="learned_range", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) super_res_scheduler = UnCLIPScheduler( variance_type="fixed_small_log", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) components = { "prior": prior, "decoder": decoder, @@ -229,20 +241,21 @@ def test_unclip(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 2.6383996e-04, - 9.9658674e-01, - 1.1275411e-03, - 2.6383996e-04, - 2.6383996e-04, - 9.9702907e-01, - 9.9973619e-01, - 9.9545717e-01, - 2.6383996e-04, - ]) + expected_slice = np.array( + 
[ + 2.6383996e-04, + 9.9658674e-01, + 1.1275411e-03, + 2.6383996e-04, + 2.6383996e-04, + 9.9702907e-01, + 9.9973619e-01, + 9.9545717e-01, + 2.6383996e-04, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_passed_text_embed(self): class DummyScheduler: @@ -264,29 +277,34 @@ class DummyScheduler: dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) shape = ( batch_size, decoder.config.in_channels, decoder.config.sample_size, - decoder.config.sample_size, ) + decoder.config.sample_size, + ) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) shape = ( batch_size, super_res_first.config.in_channels // 2, super_res_first.config.sample_size, - super_res_first.config.sample_size, ) + super_res_first.config.sample_size, + ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) pipe.set_progress_bar_config(disable=None) prompt = "this is a prompt example" generator = paddle.Generator().manual_seed(0) @@ -299,14 +317,16 @@ class DummyScheduler: prior_latents=prior_latents, decoder_latents=decoder_latents, super_res_latents=super_res_latents, - output_type="np", ) + output_type="np", + ) image = output.images text_inputs = tokenizer( prompt, padding="max_length", max_length=tokenizer.model_max_length, return_attention_mask=True, - return_tensors="pd", ) + return_tensors="pd", + ) text_model_output = text_encoder(text_inputs.input_ids) text_attention_mask = text_inputs.attention_mask generator = paddle.Generator().manual_seed(0) @@ -320,13 +340,13 @@ class DummyScheduler: super_res_latents=super_res_latents, text_model_output=text_model_output, text_attention_mask=text_attention_mask, - output_type="np", )[0] + output_type="np", + )[0] assert np.abs(image - image_from_text).max() < 0.0001 def test_attention_slicing_forward_pass(self): test_max_difference = False - self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, expected_max_diff=0.01) + self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01) def test_inference_batch_single_identical(self): test_max_difference = False @@ -365,8 +385,7 @@ def tearDown(self): def test_unclip_karlo(self): # Hard code image - expected_image = np.array([[0.73281264, 0.69175875, 0.64672112], - [0.71919304, 0.65395129, 0.60436499]]) + expected_image = np.array([[0.73281264, 0.69175875, 0.64672112], [0.71919304, 0.65395129, 0.60436499]]) pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha") pipeline.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) diff --git a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py b/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py index 2bbb56cfad604..e09f906a7f87d 100644 --- a/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/ppdiffusers/tests/pipelines/unclip/test_unclip_image_variation.py @@ -20,32 +20,41 @@ import numpy as np import paddle from paddlenlp.transformers import ( - CLIPImageProcessor, CLIPTextConfig, CLIPTextModelWithProjection, - CLIPTokenizer, CLIPVisionConfig, 
CLIPVisionModelWithProjection) - -from ppdiffusers import (DiffusionPipeline, UnCLIPImageVariationPipeline, - UnCLIPScheduler, UNet2DConditionModel, UNet2DModel) + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ppdiffusers import ( + DiffusionPipeline, + UnCLIPImageVariationPipeline, + UnCLIPScheduler, + UNet2DConditionModel, + UNet2DModel, +) from ppdiffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel from ppdiffusers.utils import floats_tensor, slow from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu -from ..pipeline_params import (IMAGE_VARIATION_BATCH_PARAMS, - IMAGE_VARIATION_PARAMS) -from ..test_pipelines_common import (PipelineTesterMixin, - assert_mean_pixel_difference) +from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference -class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, - unittest.TestCase): +class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = UnCLIPImageVariationPipeline params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"} batch_params = IMAGE_VARIATION_BATCH_PARAMS - required_optional_params = frozenset([ - "generator", - "return_dict", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ]) + required_optional_params = frozenset( + [ + "generator", + "return_dict", + "decoder_num_inference_steps", + "super_res_num_inference_steps", + ] + ) test_xformers_attention = False @property @@ -70,8 +79,7 @@ def cross_attention_dim(self): @property def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") return tokenizer @property @@ -87,7 +95,8 @@ def dummy_text_encoder(self): num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModelWithProjection(config) @property @@ -100,7 +109,8 @@ def dummy_image_encoder(self): num_attention_heads=4, image_size=32, intermediate_size=37, - patch_size=1, ) + patch_size=1, + ) return CLIPVisionModelWithProjection(config) @property @@ -123,13 +133,14 @@ def dummy_decoder(self): "out_channels": 6, "down_block_types": ( "ResnetDownsampleBlock2D", - "SimpleCrossAttnDownBlock2D", ), - "up_block_types": - ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "SimpleCrossAttnDownBlock2D", + ), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "layers_per_block": 1, "cross_attention_dim": self.cross_attention_dim, "attention_head_dim": 4, @@ -144,13 +155,12 @@ def dummy_super_res_kwargs(self): return { "sample_size": 64, "layers_per_block": 1, - "down_block_types": - ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": - ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), + "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), + "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), "block_out_channels": ( self.block_out_channels_0, - self.block_out_channels_0 * 2, ), + self.block_out_channels_0 * 2, + ), "in_channels": 6, "out_channels": 3, } @@ 
-177,11 +187,13 @@ def get_dummy_components(self): decoder_scheduler = UnCLIPScheduler( variance_type="learned_range", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) super_res_scheduler = UnCLIPScheduler( variance_type="fixed_small_log", prediction_type="epsilon", - num_train_timesteps=1000, ) + num_train_timesteps=1000, + ) feature_extractor = CLIPImageProcessor(crop_size=32, size=32) image_encoder = self.dummy_image_encoder return { @@ -207,8 +219,7 @@ def get_dummy_inputs(self, seed=0, pil_image=True): if pil_image: input_image = input_image * 0.5 + 0.5 input_image = input_image.clip(min=0, max=1) - input_image = (input_image.cpu().transpose( - perm=[0, 2, 3, 1]).cast("float32").numpy()) + input_image = input_image.cpu().transpose(perm=[0, 2, 3, 1]).cast("float32").numpy() input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] return { "image": input_image, @@ -230,20 +241,21 @@ def test_unclip_image_variation_input_tensor(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 2.7585030e-03, - 2.6383996e-04, - 9.9801058e-01, - 2.6383996e-04, - 9.9531418e-01, - 9.9220645e-01, - 3.6702752e-03, - 9.9970925e-01, - 9.9973619e-01, - ]) + expected_slice = np.array( + [ + 2.7585030e-03, + 2.6383996e-04, + 9.9801058e-01, + 2.6383996e-04, + 9.9531418e-01, + 9.9220645e-01, + 3.6702752e-03, + 9.9970925e-01, + 9.9973619e-01, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_image_variation_input_image(self): components = self.get_dummy_components() @@ -257,28 +269,28 @@ def test_unclip_image_variation_input_image(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([ - 5.2168965e-04, - 9.9861604e-01, - 9.9755847e-01, - 9.9804187e-01, - 9.9411416e-01, - 9.9248302e-01, - 9.9973619e-01, - 9.9777901e-01, - 9.9973619e-01, - ]) + expected_slice = np.array( + [ + 5.2168965e-04, + 9.9861604e-01, + 9.9755847e-01, + 9.9804187e-01, + 9.9411416e-01, + 9.9248302e-01, + 9.9973619e-01, + 9.9777901e-01, + 9.9973619e-01, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_image_variation_input_list_images(self): components = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe.set_progress_bar_config(disable=None) pipeline_inputs = self.get_dummy_inputs(pil_image=True) - pipeline_inputs[ - "image"] = [pipeline_inputs["image"], pipeline_inputs["image"]] + pipeline_inputs["image"] = [pipeline_inputs["image"], pipeline_inputs["image"]] output = pipe(**pipeline_inputs) image = output.images tuple_pipeline_inputs = self.get_dummy_inputs(pil_image=True) @@ -290,20 +302,21 @@ def test_unclip_image_variation_input_list_images(self): image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (2, 64, 64, 3) - expected_slice = np.array([ - 5.2201748e-04, - 9.9861759e-01, - 9.9755961e-01, - 9.9804127e-01, - 9.9411547e-01, - 9.9248385e-01, - 9.9973619e-01, - 9.9777836e-01, - 
9.9973619e-01, - ]) + expected_slice = np.array( + [ + 5.2201748e-04, + 9.9861759e-01, + 9.9755961e-01, + 9.9804127e-01, + 9.9411547e-01, + 9.9248385e-01, + 9.9973619e-01, + 9.9777836e-01, + 9.9973619e-01, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_unclip_passed_image_embed(self): class DummyScheduler: @@ -319,29 +332,34 @@ class DummyScheduler: batch_size, pipe.decoder.config.in_channels, pipe.decoder.config.sample_size, - pipe.decoder.config.sample_size, ) + pipe.decoder.config.sample_size, + ) decoder_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) shape = ( batch_size, pipe.super_res_first.config.in_channels // 2, pipe.super_res_first.config.sample_size, - pipe.super_res_first.config.sample_size, ) + pipe.super_res_first.config.sample_size, + ) super_res_latents = pipe.prepare_latents( shape, dtype=dtype, generator=generator, latents=None, - scheduler=DummyScheduler(), ) + scheduler=DummyScheduler(), + ) pipeline_inputs = self.get_dummy_inputs(pil_image=False) img_out_1 = pipe( **pipeline_inputs, decoder_latents=decoder_latents, - super_res_latents=super_res_latents, ).images + super_res_latents=super_res_latents, + ).images pipeline_inputs = self.get_dummy_inputs(pil_image=False) image = pipeline_inputs.pop("image") image_embeddings = pipe.image_encoder(image).image_embeds @@ -349,7 +367,8 @@ class DummyScheduler: **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents, - image_embeddings=image_embeddings, ).images + image_embeddings=image_embeddings, + ).images assert np.abs(img_out_1 - img_out_2).max() < 0.0001 def test_attention_slicing_forward_pass(self): @@ -358,8 +377,8 @@ def test_attention_slicing_forward_pass(self): expected_max_diff = 1e-2 self._test_attention_slicing_forward_pass( - test_max_difference=test_max_difference, - expected_max_diff=expected_max_diff) + test_max_difference=test_max_difference, expected_max_diff=expected_max_diff + ) def test_inference_batch_single_identical(self): test_max_difference = False @@ -398,11 +417,9 @@ def test_unclip_image_variation_karlo(self): input_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png" ) - expected_image = np.array([[0.09096909, 0.13343304, 0.26244187], - [0.15095001, 0.19459972, 0.3182609]]) + expected_image = np.array([[0.09096909, 0.13343304, 0.26244187], [0.15095001, 0.19459972, 0.3182609]]) # TODO(wugaosheng): test this function - pipeline = UnCLIPImageVariationPipeline.from_pretrained( - "kakaobrain/karlo-v1-alpha-image-variations") + pipeline = UnCLIPImageVariationPipeline.from_pretrained("kakaobrain/karlo-v1-alpha-image-variations") pipeline.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) output = pipeline(input_image, generator=generator, output_type="np") diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py index 35b1372d082b8..c3906861b23a7 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py @@ 
-21,8 +21,7 @@ import paddle from ppdiffusers import VersatileDiffusionDualGuidedPipeline -from ppdiffusers.utils.testing_utils import (load_image, nightly, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import load_image, nightly, require_paddle_gpu @nightly @@ -34,8 +33,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() pipe.set_progress_bar_config(disable=None) second_prompt = load_image( @@ -49,11 +47,11 @@ def test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname, from_diffusers=False) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) new_image = pipe( @@ -63,13 +61,12 @@ def test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't have the same forward pass" + output_type="numpy", + ).images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" def test_inference_dual_guided(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() pipe.set_progress_bar_config(disable=None) first_prompt = "cyberpunk 2077" @@ -84,18 +81,21 @@ def test_inference_dual_guided(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.01500076, - 0.01142624, - 0.01418972, - 0.01518875, - 0.01114869, - 0.01190853, - 0.02978998, - 0.02376354, - 0.02396089, - ]) + expected_slice = np.array( + [ + 0.01500076, + 0.01142624, + 0.01418972, + 0.01518875, + 0.01114869, + 0.01190853, + 0.02978998, + 0.02376354, + 0.02396089, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py index fbc38ee9f49a1..8335bdf260d7a 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py @@ -19,8 +19,7 @@ import paddle from ppdiffusers import VersatileDiffusionImageVariationPipeline -from ppdiffusers.utils.testing_utils import (load_image, require_paddle_gpu, - slow) +from ppdiffusers.utils.testing_utils import load_image, require_paddle_gpu, slow class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): @@ -29,11 +28,9 @@ class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): @slow @require_paddle_gpu 
-class VersatileDiffusionImageVariationPipelineIntegrationTests( - unittest.TestCase): +class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase): def test_inference_image_variations(self): - pipe = VersatileDiffusionImageVariationPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.set_progress_bar_config(disable=None) image_prompt = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" @@ -44,18 +41,21 @@ def test_inference_image_variations(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([ - 0.12047189, - 0.19138041, - 0.22884357, - 0.08833978, - 0.1594424, - 0.16826832, - 0.07032129, - 0.14926612, - 0.12981007, - ]) + expected_slice = np.array( + [ + 0.12047189, + 0.19138041, + 0.22884357, + 0.08833978, + 0.1594424, + 0.16826832, + 0.07032129, + 0.14926612, + 0.12981007, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py index ed49997b5a89b..aab7e81ba0c40 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py @@ -21,8 +21,7 @@ import paddle from ppdiffusers import VersatileDiffusionPipeline -from ppdiffusers.utils.testing_utils import (load_image, nightly, - require_paddle_gpu) +from ppdiffusers.utils.testing_utils import load_image, nightly, require_paddle_gpu class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase): @@ -38,8 +37,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_from_save_pretrained(self): - pipe = VersatileDiffusionPipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.set_progress_bar_config(disable=None) prompt_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" @@ -52,11 +50,11 @@ def test_from_save_pretrained(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionPipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, from_diffusers=False) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) new_image = pipe.dual_guided( @@ -66,13 +64,12 @@ def test_from_save_pretrained(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't have the same forward pass" + output_type="numpy", + ).images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" def test_inference_dual_guided_then_text_to_image(self): - pipe = VersatileDiffusionPipeline.from_pretrained( - "shi-labs/versatile-diffusion", 
paddle_dtype=paddle.float16) + pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", paddle_dtype=paddle.float16) pipe.set_progress_bar_config(disable=None) prompt = "cyberpunk 2077" init_image = load_image( @@ -86,21 +83,24 @@ def test_inference_dual_guided_then_text_to_image(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.1448, 0.1619, 0.1741, 0.1086, 0.1147, 0.1128, 0.1199, 0.1165, 0.1001]) - expected_slice = np.array([ - 0.03100586, - 0.02929688, - 0.03271484, - 0.02807617, - 0.02905273, - 0.03173828, - 0.02685547, - 0.02807617, - 0.03271484, - ]) + expected_slice = np.array( + [ + 0.03100586, + 0.02929688, + 0.03271484, + 0.02807617, + 0.02905273, + 0.03173828, + 0.02685547, + 0.02807617, + 0.03271484, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 prompt = "A painting of a squirrel eating a burger " generator = paddle.Generator().manual_seed(0) @@ -109,36 +109,40 @@ def test_inference_dual_guided_then_text_to_image(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.387, 0.479, 0.3796, 0.4009, 0.4878, 0.4778]) - expected_slice = np.array([ - 0.0390625, - 0.00854492, - 0.0, - 0.03930664, - 0.00878906, - 0.04711914, - 0.03686523, - 0.0, - 0.0246582, - ]) + expected_slice = np.array( + [ + 0.0390625, + 0.00854492, + 0.0, + 0.03930664, + 0.00878906, + 0.04711914, + 0.03686523, + 0.0, + 0.0246582, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 - image = pipe.image_variation( - init_image, generator=generator, output_type="numpy").images + image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.3076, 0.3123, 0.3284, 0.3782, 0.377, 0.3894, 0.4297, 0.4331, 0.4456]) - expected_slice = np.array([ - 0.34472656, - 0.1940918, - 0.10546875, - 0.38134766, - 0.24560547, - 0.13208008, - 0.38867188, - 0.30566406, - 0.18188477, - ]) + expected_slice = np.array( + [ + 0.34472656, + 0.1940918, + 0.10546875, + 0.38134766, + 0.24560547, + 0.13208008, + 0.38867188, + 0.30566406, + 0.18188477, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.1 diff --git a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py index fbe47142eafcb..c95b30030f3d5 100644 --- a/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py +++ b/ppdiffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py @@ -37,8 +37,7 @@ def tearDown(self): paddle.device.cuda.empty_cache() def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.remove_unused_weights() pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger " @@ -48,11 +47,11 @@ def 
test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images + output_type="numpy", + ).images with tempfile.TemporaryDirectory() as tmpdirname: pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - tmpdirname, from_diffusers=False) + pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname, from_diffusers=False) pipe.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) new_image = pipe( @@ -60,13 +59,12 @@ def test_remove_unused_weights_save_load(self): generator=generator, guidance_scale=7.5, num_inference_steps=2, - output_type="numpy", ).images - assert (np.abs(image - new_image).sum() < 1e-05 - ), "Models don't have the same forward pass" + output_type="numpy", + ).images + assert np.abs(image - new_image).sum() < 1e-05, "Models don't have the same forward pass" def test_inference_text2img(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion") + pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") pipe.set_progress_bar_config(disable=None) prompt = "A painting of a squirrel eating a burger " generator = paddle.Generator().manual_seed(0) @@ -75,19 +73,22 @@ def test_inference_text2img(self): generator=generator, guidance_scale=7.5, num_inference_steps=50, - output_type="numpy", ).images + output_type="numpy", + ).images image_slice = image[0, 253:256, 253:256, -1] assert image.shape == (1, 512, 512, 3) # expected_slice = np.array([0.3493, 0.3757, 0.4093, 0.4495, 0.4233, 0.4102, 0.4507, 0.4756, 0.4787]) - expected_slice = np.array([ - 0.0390625, - 0.00854492, - 0.0, - 0.03930664, - 0.00878906, - 0.04711914, - 0.03686523, - 0.0, - 0.0246582, - ]) + expected_slice = np.array( + [ + 0.0390625, + 0.00854492, + 0.0, + 0.03930664, + 0.00878906, + 0.04711914, + 0.03686523, + 0.0, + 0.0246582, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 diff --git a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py index 5c65fd95fc95f..c17b7fd1d7257 100644 --- a/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py +++ b/ppdiffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py @@ -20,10 +20,15 @@ import paddle from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer -from ppdiffusers import (Transformer2DModel, VQDiffusionPipeline, - VQDiffusionScheduler, VQModel) -from ppdiffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import \ - LearnedClassifierFreeSamplingEmbeddings +from ppdiffusers import ( + Transformer2DModel, + VQDiffusionPipeline, + VQDiffusionScheduler, + VQModel, +) +from ppdiffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import ( + LearnedClassifierFreeSamplingEmbeddings, +) from ppdiffusers.utils import load_numpy, slow from ppdiffusers.utils.testing_utils import require_paddle_gpu @@ -57,13 +62,13 @@ def dummy_vqvae(self): up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=3, num_vq_embeddings=self.num_embed, - vq_embed_dim=3, ) + vq_embed_dim=3, + ) return model @property def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-clip") + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") return tokenizer @property @@ -78,7 +83,8 @@ def dummy_text_encoder(self): num_attention_heads=4, 
num_hidden_layers=5, pad_token_id=1, - vocab_size=1000, ) + vocab_size=1000, + ) return CLIPTextModel(config).eval() @property @@ -106,8 +112,7 @@ def test_vq_diffusion(self): tokenizer = self.dummy_tokenizer transformer = self.dummy_transformer scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = ( - LearnedClassifierFreeSamplingEmbeddings(learnable=False)) + learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings(learnable=False) pipe = VQDiffusionPipeline( vqvae=vqvae, text_encoder=text_encoder, @@ -119,11 +124,7 @@ def test_vq_diffusion(self): pipe.set_progress_bar_config(disable=None) prompt = "teddy bear playing in the pool" generator = paddle.Generator().manual_seed(0) - output = pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np") + output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = pipe( @@ -131,24 +132,26 @@ def test_vq_diffusion(self): generator=generator, output_type="np", return_dict=False, - num_inference_steps=2, )[0] + num_inference_steps=2, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 24, 24, 3) - expected_slice = np.array([ - 0.5900591, - 0.83443725, - 0.4418438, - 0.604656, - 0.89781034, - 0.40088692, - 0.6107253, - 0.87849474, - 0.64088374, - ]) + expected_slice = np.array( + [ + 0.5900591, + 0.83443725, + 0.4418438, + 0.604656, + 0.89781034, + 0.40088692, + 0.6107253, + 0.87849474, + 0.64088374, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 def test_vq_diffusion_classifier_free_sampling(self): vqvae = self.dummy_vqvae @@ -156,11 +159,11 @@ def test_vq_diffusion_classifier_free_sampling(self): tokenizer = self.dummy_tokenizer transformer = self.dummy_transformer scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = ( - LearnedClassifierFreeSamplingEmbeddings( - learnable=True, - hidden_size=self.text_embedder_hidden_size, - length=tokenizer.model_max_length, )) + learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings( + learnable=True, + hidden_size=self.text_embedder_hidden_size, + length=tokenizer.model_max_length, + ) pipe = VQDiffusionPipeline( vqvae=vqvae, text_encoder=text_encoder, @@ -172,11 +175,7 @@ def test_vq_diffusion_classifier_free_sampling(self): pipe.set_progress_bar_config(disable=None) prompt = "teddy bear playing in the pool" generator = paddle.Generator().manual_seed(0) - output = pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np") + output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") image = output.images generator = paddle.Generator().manual_seed(0) image_from_tuple = pipe( @@ -184,24 +183,26 @@ def test_vq_diffusion_classifier_free_sampling(self): generator=generator, output_type="np", return_dict=False, - num_inference_steps=2, )[0] + num_inference_steps=2, + )[0] image_slice = image[0, -3:, -3:, -1] image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] assert image.shape == (1, 24, 24, 3) - expected_slice = np.array([ - 0.61711097, - 0.8419658, - 0.5493732, - 0.64064896, - 0.97944254, - 0.5611503, - 
0.6145399, - 0.7063037, - 0.54406035, - ]) + expected_slice = np.array( + [ + 0.61711097, + 0.8419658, + 0.5493732, + 0.64064896, + 0.97944254, + 0.5611503, + 0.6145399, + 0.7063037, + 0.54406035, + ] + ) assert np.abs(image_slice.flatten() - expected_slice).max() < 0.01 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max( - ) < 0.01 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 0.01 @slow @@ -216,8 +217,7 @@ def test_vq_diffusion_classifier_free_sampling(self): expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy" ) - pipeline = VQDiffusionPipeline.from_pretrained( - "microsoft/vq-diffusion-ithq") + pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq") pipeline = pipeline pipeline.set_progress_bar_config(disable=None) generator = paddle.Generator().manual_seed(0) @@ -225,7 +225,8 @@ def test_vq_diffusion_classifier_free_sampling(self): "teddy bear playing in the pool", num_images_per_prompt=1, generator=generator, - output_type="np", ) + output_type="np", + ) image = output.images[0] assert image.shape == (256, 256, 3) assert np.abs(expected_image - image).max() < 0.01 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py b/ppdiffusers/tests/schedulers/test_scheduler_ddim.py index ce993b9501fb1..c578c2ffb27cd 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_ddim.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_ddim.py @@ -20,7 +20,7 @@ class DDIMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDIMScheduler, ) + scheduler_classes = (DDIMScheduler,) forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50)) def get_scheduler_config(self, **kwargs): @@ -65,12 +65,10 @@ def test_steps_offset(self): scheduler_config = self.get_scheduler_config(steps_offset=1) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(5) - assert paddle.equal_all(scheduler.timesteps, - paddle.to_tensor([801, 601, 401, 201, 1])) + assert paddle.equal_all(scheduler.timesteps, paddle.to_tensor([801, 601, 401, 201, 1])) def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], - [0.002, 0.02, 0.2, 2]): + for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -92,7 +90,8 @@ def test_thresholding(self): self.check_over_configs( thresholding=True, prediction_type=prediction_type, - sample_max_value=threshold, ) + sample_max_value=threshold, + ) def test_time_indices(self): for t in [1, 10, 49]: @@ -100,8 +99,7 @@ def test_time_indices(self): def test_inference_steps(self): for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): - self.check_over_forward( - time_step=t, num_inference_steps=num_inference_steps) + self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) def test_eta(self): for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]): @@ -112,18 +110,12 @@ def test_variance(self): scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) - assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - - 0.0)) < 1e-5 - assert (paddle.sum( - paddle.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5) - assert (paddle.sum( - paddle.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5) - assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - - 0.0)) < 1e-5 
- assert (paddle.sum( - paddle.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5) - assert paddle.sum( - paddle.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 def test_full_loop_no_noise(self): sample = self.full_loop() diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py b/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py index e9fa28609abda..9768d50cc5dbc 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_ddpm.py @@ -20,7 +20,7 @@ class DDPMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDPMScheduler, ) + scheduler_classes = (DDPMScheduler,) def get_scheduler_config(self, **kwargs): config = { @@ -40,8 +40,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], - [0.002, 0.02, 0.2, 2]): + for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -63,7 +62,8 @@ def test_thresholding(self): self.check_over_configs( thresholding=True, prediction_type=prediction_type, - sample_max_value=threshold, ) + sample_max_value=threshold, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "sample", "v_prediction"]: @@ -79,10 +79,8 @@ def test_variance(self): scheduler = scheduler_class(**scheduler_config) assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 0.0)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(487) - - 0.00979)) < 1e-5 - assert paddle.sum(paddle.abs(scheduler._get_variance(999) - - 0.02)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.02)) < 1e-5 def test_full_loop_no_noise(self): scheduler_class = self.scheduler_classes[0] @@ -100,8 +98,7 @@ def test_full_loop_no_noise(self): residual = model(sample, t) # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, generator=generator).prev_sample + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample # if t > 0: # noise = self.dummy_sample_deter @@ -118,8 +115,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) num_trained_timesteps = len(scheduler) @@ -133,8 +129,7 @@ def test_full_loop_with_v_prediction(self): residual = model(sample, t) # 2. 
predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, generator=generator).prev_sample + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample # if t > 0: # noise = self.dummy_sample_deter @@ -178,13 +173,10 @@ def test_custom_timesteps_increasing_order(self): timesteps = [100, 87, 50, 51, 0] - with self.assertRaises( - ValueError, - msg="`custom_timesteps` must be in descending order."): + with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."): scheduler.set_timesteps(timesteps=timesteps) - def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps( - self): + def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) @@ -193,11 +185,10 @@ def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps( num_inference_steps = len(timesteps) with self.assertRaises( - ValueError, - msg="Can only pass one of `num_inference_steps` or `custom_timesteps`.", + ValueError, + msg="Can only pass one of `num_inference_steps` or `custom_timesteps`.", ): - scheduler.set_timesteps( - num_inference_steps=num_inference_steps, timesteps=timesteps) + scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps) def test_custom_timesteps_too_large(self): scheduler_class = self.scheduler_classes[0] @@ -207,7 +198,7 @@ def test_custom_timesteps_too_large(self): timesteps = [scheduler.config.num_train_timesteps] with self.assertRaises( - ValueError, - msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", + ValueError, + msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", ): scheduler.set_timesteps(timesteps=timesteps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_deis.py b/ppdiffusers/tests/schedulers/test_scheduler_deis.py index b40af9f177525..7ea11c2198020 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_deis.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_deis.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class DEISMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DEISMultistepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (DEISMultistepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -43,38 +47,28 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = 
dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. - solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -84,9 +78,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -94,8 +86,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -104,18 +95,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -150,27 +135,20 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -210,7 +188,8 @@ def test_thresholding(self): sample_max_value=threshold, algorithm_type="deis", solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -225,14 +204,15 @@ def test_solver_order_and_type(self): solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) + algorithm_type=algorithm_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + algorithm_type=algorithm_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -240,8 +220,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -257,8 +236,7 @@ def test_full_loop_with_v_prediction(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - 
thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py index 8935cd0ba072e..869b1cc9280d1 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_dpm_multi.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class DPMSolverMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverMultistepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (DPMSolverMultistepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -49,38 +53,28 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -90,9 +84,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -100,8 +92,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -110,18 +101,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. - solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -152,27 +137,20 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. 
- solver_order] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -193,7 +171,8 @@ def test_thresholding(self): sample_max_value=threshold, algorithm_type="dpmsolver++", solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -208,14 +187,15 @@ def test_solver_order_and_type(self): solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) + algorithm_type=algorithm_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + algorithm_type=algorithm_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -223,8 +203,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -233,10 +212,7 @@ def test_full_loop_no_noise(self): assert abs(result_mean.item() - 0.3301) < 1e-3 def test_full_loop_no_noise_thres(self): - sample = self.full_loop( - thresholding=True, - dynamic_thresholding_ratio=0.87, - sample_max_value=0.5) + sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5) result_mean = paddle.mean(paddle.abs(sample)) assert abs(result_mean.item() - 1.1364) < 1e-3 @@ -248,8 +224,7 @@ def test_full_loop_with_v_prediction(self): assert abs(result_mean.item() - 0.2251) < 1e-3 def test_full_loop_with_karras_and_v_prediction(self): - sample = self.full_loop( - prediction_type="v_prediction", use_karras_sigmas=True) + sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True) result_mean = paddle.mean(paddle.abs(sample)) assert abs(result_mean.item() - 0.2096) < 1e-3 @@ -275,8 +250,7 @@ def test_switch(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 @@ -297,5 +271,4 @@ def test_unique_timesteps(self, **config): scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(scheduler.config.num_train_timesteps) - assert len(scheduler.timesteps.unique( - )) == scheduler.num_inference_steps + assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps diff --git 
a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py b/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py index bb702887ed40f..ce229323bc363 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_dpm_single.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class DPMSolverSinglestepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverSinglestepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (DPMSolverSinglestepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -48,38 +52,28 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -89,9 +83,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -99,8 +91,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -109,18 +100,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -178,7 +163,8 @@ def test_thresholding(self): sample_max_value=threshold, algorithm_type="dpmsolver++", solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -193,14 +179,15 @@ def test_solver_order_and_type(self): solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) + algorithm_type=algorithm_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, prediction_type=prediction_type, - algorithm_type=algorithm_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + algorithm_type=algorithm_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -208,8 +195,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -225,8 +211,7 @@ def test_full_loop_with_v_prediction(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler.py b/ppdiffusers/tests/schedulers/test_scheduler_euler.py index bdca25bba1cb3..d6cfc9fe4474b 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_euler.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_euler.py @@ -20,7 +20,7 @@ class EulerDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerDiscreteScheduler, ) + scheduler_classes = (EulerDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -68,8 +67,7 @@ def test_full_loop_no_noise(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = 
paddle.sum(paddle.abs(sample)) @@ -80,8 +78,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) @@ -96,8 +93,7 @@ def test_full_loop_with_v_prediction(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -123,8 +119,7 @@ def test_full_loop_device(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -150,8 +145,7 @@ def test_full_loop_device_karras_sigmas(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py index cb2d308947d3b..fdc7f2a34f30f 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_euler_ancestral.py @@ -20,7 +20,7 @@ class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerAncestralDiscreteScheduler, ) + scheduler_classes = (EulerAncestralDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -68,8 +67,7 @@ def test_full_loop_no_noise(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -80,8 +78,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) @@ -96,8 +93,7 @@ def test_full_loop_with_v_prediction(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -122,8 +118,7 @@ def test_full_loop_device(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, 
sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_heun.py b/ppdiffusers/tests/schedulers/test_scheduler_heun.py index b8223700592bb..0f62ae519f4e0 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_heun.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_heun.py @@ -20,7 +20,7 @@ class HeunDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (HeunDiscreteScheduler, ) + scheduler_classes = (HeunDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -78,8 +77,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py b/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py index 39558436871af..c282c6a61079b 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_ipndm.py @@ -22,8 +22,8 @@ class IPNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (IPNDMScheduler, ) - forward_default_kwargs = (("num_inference_steps", 50), ) + scheduler_classes = (IPNDMScheduler,) + forward_default_kwargs = (("num_inference_steps", 50),) def get_scheduler_config(self, **kwargs): config = {"num_train_timesteps": 1000} @@ -59,21 +59,15 @@ def check_over_configs(self, time_step=0, **config): # copy over dummy past residuals new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -110,21 +104,15 @@ def check_over_forward(self, time_step=0, **forward_kwargs): # copy over dummy past residual (must be after setting timesteps) new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - 
new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, **config): scheduler_class = self.scheduler_classes[0] @@ -158,11 +146,9 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) @@ -177,31 +163,25 @@ def test_step_shape(self): time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) def test_timesteps(self): for timesteps in [100, 1000]: - self.check_over_configs( - num_train_timesteps=timesteps, time_step=None) + self.check_over_configs(num_train_timesteps=timesteps, time_step=None) def test_inference_steps(self): for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=None) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None) def test_full_loop_no_noise(self): sample = self.full_loop() diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py index 4081289cebb20..770b4f226ba5c 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py @@ -20,7 +20,7 @@ class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest): - 
scheduler_classes = (KDPM2AncestralDiscreteScheduler, ) + scheduler_classes = (KDPM2AncestralDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -65,8 +64,7 @@ def test_full_loop_no_noise(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -82,8 +80,7 @@ def test_prediction_type(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) @@ -98,8 +95,7 @@ def test_full_loop_with_v_prediction(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) @@ -125,8 +121,7 @@ def test_full_loop_device(self): model_output = model(sample, t) - output = scheduler.step( - model_output, t, sample, generator=generator) + output = scheduler.step(model_output, t, sample, generator=generator) sample = output.prev_sample result_sum = paddle.sum(paddle.abs(sample)) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py index ee87c662588d7..3da7b7e75fd44 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py @@ -20,7 +20,7 @@ class KDPM2DiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2DiscreteScheduler, ) + scheduler_classes = (KDPM2DiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -53,8 +52,7 @@ def test_prediction_type(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_lms.py b/ppdiffusers/tests/schedulers/test_scheduler_lms.py index 0be32200e94c8..8ee87bbddf624 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_lms.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_lms.py @@ -20,7 +20,7 @@ class 
LMSDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (LMSDiscreteScheduler, ) + scheduler_classes = (LMSDiscreteScheduler,) num_inference_steps = 10 def get_scheduler_config(self, **kwargs): @@ -39,8 +39,7 @@ def test_timesteps(self): self.check_over_configs(num_train_timesteps=timesteps) def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], - [0.0002, 0.002, 0.02]): + for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): self.check_over_configs(beta_start=beta_start, beta_end=beta_end) def test_schedules(self): @@ -81,8 +80,7 @@ def test_full_loop_no_noise(self): def test_full_loop_with_v_prediction(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - prediction_type="v_prediction") + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(self.num_inference_steps) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py b/ppdiffusers/tests/schedulers/test_scheduler_pndm.py index ab94b8ffca3f3..ad2998c26bfd9 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_pndm.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_pndm.py @@ -22,8 +22,8 @@ class PNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (PNDMScheduler, ) - forward_default_kwargs = (("num_inference_steps", 50), ) + scheduler_classes = (PNDMScheduler,) + forward_default_kwargs = (("num_inference_steps", 50),) def get_scheduler_config(self, **kwargs): config = { @@ -62,21 +62,15 @@ def check_over_configs(self, time_step=0, **config): # copy over dummy past residuals new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): pass @@ -110,21 +104,15 @@ def check_over_forward(self, time_step=0, **forward_kwargs): # copy over dummy past residual (must be after setting timesteps) new_scheduler.ets = dummy_past_residuals[:] - output = scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - 
new_output)) < 1e-5, "Scheduler outputs are not identical" - output = scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, **config): scheduler_class = self.scheduler_classes[0] @@ -158,11 +146,9 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) @@ -174,18 +160,14 @@ def test_step_shape(self): ] scheduler.ets = dummy_past_residuals[:] - output_0 = scheduler.step_prk(residual, 0, sample, - **kwargs).prev_sample - output_1 = scheduler.step_prk(residual, 1, sample, - **kwargs).prev_sample + output_0 = scheduler.step_prk(residual, 0, sample, **kwargs).prev_sample + output_1 = scheduler.step_prk(residual, 1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) - output_0 = scheduler.step_plms(residual, 0, sample, - **kwargs).prev_sample - output_1 = scheduler.step_plms(residual, 1, sample, - **kwargs).prev_sample + output_0 = scheduler.step_plms(residual, 0, sample, **kwargs).prev_sample + output_1 = scheduler.step_plms(residual, 1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -204,27 +186,30 @@ def test_steps_offset(self): scheduler.set_timesteps(10) assert paddle.equal_all( scheduler.timesteps, - paddle.to_tensor([ - 901, - 851, - 851, - 801, - 801, - 751, - 751, - 701, - 701, - 651, - 651, - 601, - 601, - 501, - 401, - 301, - 201, - 101, - 1, - ]), ) + paddle.to_tensor( + [ + 901, + 851, + 851, + 801, + 801, + 751, + 751, + 701, + 701, + 651, + 651, + 601, + 601, + 501, + 401, + 301, + 201, + 101, + 1, + ] + ), + ) def test_betas(self): for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]): @@ -269,8 +254,7 @@ def test_inference_plms_no_past_residuals(self): scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) - scheduler.step_plms(self.dummy_sample, 1, - self.dummy_sample).prev_sample + scheduler.step_plms(self.dummy_sample, 1, self.dummy_sample).prev_sample def test_full_loop_no_noise(self): sample = self.full_loop() diff --git a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py b/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py index 3c2c1cd8ac641..ac15c502eda8d 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_score_sde_ve.py @@ -23,7 +23,7 @@ class ScoreSdeVeSchedulerTest(unittest.TestCase): # TODO adapt with class SchedulerCommonTest (scheduler needs Numpy Integration) - 
scheduler_classes = (ScoreSdeVeScheduler, ) + scheduler_classes = (ScoreSdeVeScheduler,) forward_default_kwargs = () @property @@ -85,34 +85,22 @@ def check_over_configs(self, time_step=0, **config): new_scheduler = scheduler_class.from_pretrained(tmpdirname) output = scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" output = scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler correction are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" def check_over_forward(self, time_step=0, **forward_kwargs): kwargs = dict(self.forward_default_kwargs) @@ -130,34 +118,22 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler = scheduler_class.from_pretrained(tmpdirname) output = scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_pred( - residual, - time_step, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, time_step, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" output = scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample new_output = new_scheduler.step_correct( - residual, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler correction are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" def test_timesteps(self): for timesteps in [10, 100, 1000]: @@ -193,15 +169,12 @@ def test_full_loop_no_noise(self): for _ in range(scheduler.config.correct_steps): with paddle.no_grad(): model_output = model(sample, sigma_t) - sample = scheduler.step_correct( - model_output, sample, generator=generator, - **kwargs).prev_sample + sample = scheduler.step_correct(model_output, 
sample, generator=generator, **kwargs).prev_sample with paddle.no_grad(): model_output = model(sample, sigma_t) - output = scheduler.step_pred( - model_output, t, sample, generator=generator, **kwargs) + output = scheduler.step_pred(model_output, t, sample, generator=generator, **kwargs) sample, _ = output.prev_sample, output.prev_sample_mean result_sum = paddle.sum(paddle.abs(sample)) @@ -222,25 +195,17 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps output_0 = scheduler.step_pred( - residual, - 0, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, 0, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample output_1 = scheduler.step_pred( - residual, - 1, - sample, - generator=paddle.Generator().manual_seed(0), - **kwargs).prev_sample + residual, 1, sample, generator=paddle.Generator().manual_seed(0), **kwargs + ).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py b/ppdiffusers/tests/schedulers/test_scheduler_unclip.py index 5ac931e6abef5..b37fa2c513271 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_unclip.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_unclip.py @@ -21,7 +21,7 @@ # UnCLIPScheduler is a modified DDPMScheduler with a subset of the configuration. 
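(Aside, not part of the diff: almost every hunk above reformats the same save/reload round trip that these scheduler tests rely on — save the config, reload it with from_pretrained, step both instances with the same seeded generator, and assert the outputs match. The sketch below shows that pattern in isolation; the choice of DDPMScheduler, the 8x8 dummy sample, and the 50-step schedule are illustrative assumptions and do not come from this PR.)

    # Minimal sketch of the save/reload round-trip checked throughout these tests.
    import tempfile

    import paddle

    from ppdiffusers import DDPMScheduler

    scheduler = DDPMScheduler(num_train_timesteps=1000)
    scheduler.set_timesteps(50)

    sample = paddle.ones([1, 3, 8, 8])  # dummy latent standing in for a model input
    residual = 0.1 * sample             # dummy model output

    # Save the scheduler config and reload it, as check_over_configs does above.
    with tempfile.TemporaryDirectory() as tmpdirname:
        scheduler.save_config(tmpdirname)
        reloaded = DDPMScheduler.from_pretrained(tmpdirname)
    reloaded.set_timesteps(50)

    # Step both instances with identically seeded generators and compare outputs.
    t = scheduler.timesteps[0]
    out = scheduler.step(residual, t, sample, generator=paddle.Generator().manual_seed(0)).prev_sample
    new_out = reloaded.step(residual, t, sample, generator=paddle.Generator().manual_seed(0)).prev_sample
    assert paddle.sum(paddle.abs(out - new_out)) < 1e-5, "Scheduler outputs are not identical"

(End of aside; the diff continues below.)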
class UnCLIPSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UnCLIPScheduler, ) + scheduler_classes = (UnCLIPScheduler,) def get_scheduler_config(self, **kwargs): config = { @@ -61,36 +61,27 @@ def test_time_indices(self): if prev_timestep is not None and prev_timestep >= time_step: continue - self.check_over_forward( - time_step=time_step, prev_timestep=prev_timestep) + self.check_over_forward(time_step=time_step, prev_timestep=prev_timestep) def test_variance_fixed_small_log(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - variance_type="fixed_small_log") + scheduler_config = self.get_scheduler_config(variance_type="fixed_small_log") scheduler = scheduler_class(**scheduler_config) - assert paddle.sum(paddle.abs(scheduler._get_variance(0) - - 1.0000e-10)) < 1e-5 - assert paddle.sum( - paddle.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5 - assert paddle.sum( - paddle.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(0) - 1.0000e-10)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5 + assert paddle.sum(paddle.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5 def test_variance_learned_range(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - variance_type="learned_range") + scheduler_config = self.get_scheduler_config(variance_type="learned_range") scheduler = scheduler_class(**scheduler_config) predicted_variance = 0.5 - assert (scheduler._get_variance( - 1, predicted_variance=predicted_variance) - -10.1712790 < 1e-5) - assert (scheduler._get_variance( - 487, predicted_variance=predicted_variance) - -5.7998052 < 1e-5) - assert (scheduler._get_variance( - 999, predicted_variance=predicted_variance) - -0.0010011 < 1e-5) + assert scheduler._get_variance(1, predicted_variance=predicted_variance) - -10.1712790 < 1e-5 + assert scheduler._get_variance(487, predicted_variance=predicted_variance) - -5.7998052 < 1e-5 + assert scheduler._get_variance(999, predicted_variance=predicted_variance) - -0.0010011 < 1e-5 def test_full_loop(self): scheduler_class = self.scheduler_classes[0] @@ -108,8 +99,7 @@ def test_full_loop(self): residual = model(sample, t) # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, generator=generator).prev_sample + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample sample = pred_prev_sample @@ -143,11 +133,8 @@ def test_full_loop_skip_timesteps(self): # 2. 
predict previous mean of sample x_t-1 pred_prev_sample = scheduler.step( - residual, - t, - sample, - prev_timestep=prev_timestep, - generator=generator).prev_sample + residual, t, sample, prev_timestep=prev_timestep, generator=generator + ).prev_sample sample = pred_prev_sample diff --git a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py b/ppdiffusers/tests/schedulers/test_scheduler_unipc.py index 7d28f06cd5fb7..0c19a3bb8387a 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_unipc.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_unipc.py @@ -16,15 +16,19 @@ import paddle -from ppdiffusers import (DEISMultistepScheduler, DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, UniPCMultistepScheduler) +from ppdiffusers import ( + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + UniPCMultistepScheduler, +) from .test_schedulers import SchedulerCommonTest class UniPCMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UniPCMultistepScheduler, ) - forward_default_kwargs = (("num_inference_steps", 25), ) + scheduler_classes = (UniPCMultistepScheduler,) + forward_default_kwargs = (("num_inference_steps", 25),) def get_scheduler_config(self, **kwargs): config = { @@ -44,47 +48,35 @@ def check_over_configs(self, time_step=0, **config): num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. 
- solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] output, new_output = sample, sample - for t in range(time_step, - time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, - **kwargs).prev_sample + for t in range(time_step, time_step + scheduler.config.solver_order + 1): + output = scheduler.step(residual, t, output, **kwargs).prev_sample + new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def check_over_forward(self, time_step=0, **forward_kwargs): kwargs = dict(self.forward_default_kwargs) num_inference_steps = kwargs.pop("num_inference_steps", None) sample = self.dummy_sample residual = 0.1 * sample - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] for scheduler_class in self.scheduler_classes: scheduler_config = self.get_scheduler_config() @@ -92,8 +84,7 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.set_timesteps(num_inference_steps) # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. - solver_order] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_config(tmpdirname) @@ -102,18 +93,12 @@ def check_over_forward(self, time_step=0, **forward_kwargs): new_scheduler.set_timesteps(num_inference_steps) # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: - new_scheduler. - config. - solver_order] + new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def full_loop(self, scheduler=None, **config): if scheduler is None: @@ -148,27 +133,20 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [ - residual + 0.2, residual + 0.15, residual + 0.10 - ] - scheduler.model_outputs = dummy_past_residuals[:scheduler.config. 
- solver_order] + dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] + scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] time_step_0 = scheduler.timesteps[5] time_step_1 = scheduler.timesteps[6] - output_0 = scheduler.step(residual, time_step_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -207,7 +185,8 @@ def test_thresholding(self): prediction_type=prediction_type, sample_max_value=threshold, solver_order=order, - solver_type=solver_type, ) + solver_type=solver_type, + ) def test_prediction_type(self): for prediction_type in ["epsilon", "v_prediction"]: @@ -220,13 +199,14 @@ def test_solver_order_and_type(self): self.check_over_configs( solver_order=order, solver_type=solver_type, - prediction_type=prediction_type, ) + prediction_type=prediction_type, + ) sample = self.full_loop( solver_order=order, solver_type=solver_type, - prediction_type=prediction_type, ) - assert not paddle.isnan(sample).any( - ), "Samples have nan numbers" + prediction_type=prediction_type, + ) + assert not paddle.isnan(sample).any(), "Samples have nan numbers" def test_lower_order_final(self): self.check_over_configs(lower_order_final=True) @@ -234,8 +214,7 @@ def test_lower_order_final(self): def test_inference_steps(self): for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward( - num_inference_steps=num_inference_steps, time_step=0) + self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) def test_full_loop_no_noise(self): sample = self.full_loop() @@ -251,8 +230,7 @@ def test_full_loop_with_v_prediction(self): def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config( - thresholding=True, dynamic_thresholding_ratio=0) + scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) scheduler = scheduler_class(**scheduler_config) num_inference_steps = 10 @@ -272,5 +250,4 @@ def test_unique_timesteps(self, **config): scheduler = scheduler_class(**scheduler_config) scheduler.set_timesteps(scheduler.config.num_train_timesteps) - assert len(scheduler.timesteps.unique( - )) == scheduler.num_inference_steps + assert len(scheduler.timesteps.unique()) == scheduler.num_inference_steps diff --git a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py b/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py index 81ed3de4a1062..c40e7834d682f 100644 --- a/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py +++ b/ppdiffusers/tests/schedulers/test_scheduler_vq_diffusion.py @@ -21,7 +21,7 @@ class VQDiffusionSchedulerTest(SchedulerCommonTest): - scheduler_classes = (VQDiffusionScheduler, ) + scheduler_classes = (VQDiffusionScheduler,) def get_scheduler_config(self, **kwargs): config = { @@ -37,8 +37,7 @@ def dummy_sample(self, num_vec_classes): height = 8 width = 8 - sample = paddle.randint(0, num_vec_classes, - (batch_size, height * width)) + sample = paddle.randint(0, num_vec_classes, (batch_size, height * width)) return sample @@ -49,10 +48,8 @@ def dummy_sample_deter(self): def dummy_model(self, num_vec_classes): def model(sample, t, *args): batch_size, num_latent_pixels = 
sample.shape - logits = paddle.rand( - (batch_size, num_vec_classes - 1, num_latent_pixels)) - return_value = F.log_softmax( - logits.cast("float64"), axis=1).cast("float32") + logits = paddle.rand((batch_size, num_vec_classes - 1, num_latent_pixels)) + return_value = F.log_softmax(logits.cast("float64"), axis=1).cast("float32") return return_value return model diff --git a/ppdiffusers/tests/schedulers/test_schedulers.py b/ppdiffusers/tests/schedulers/test_schedulers.py index f01069d246e6a..92b11a679f661 100755 --- a/ppdiffusers/tests/schedulers/test_schedulers.py +++ b/ppdiffusers/tests/schedulers/test_schedulers.py @@ -24,9 +24,14 @@ import paddle import ppdiffusers -from ppdiffusers import (EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, IPNDMScheduler, - LMSDiscreteScheduler, VQDiffusionScheduler, logging) +from ppdiffusers import ( + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + IPNDMScheduler, + LMSDiscreteScheduler, + VQDiffusionScheduler, + logging, +) from ppdiffusers.configuration_utils import ConfigMixin, register_to_config from ppdiffusers.schedulers.scheduling_utils import SchedulerMixin from ppdiffusers.utils.testing_utils import CaptureLogger @@ -37,12 +42,13 @@ class SchedulerObject(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], ): + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + e=[1, 3], + ): pass @@ -51,12 +57,13 @@ class SchedulerObject2(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - f=[1, 3], ): + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + f=[1, 3], + ): pass @@ -65,13 +72,14 @@ class SchedulerObject3(SchedulerMixin, ConfigMixin): @register_to_config def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3], ): + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + e=[1, 3], + f=[1, 3], + ): pass @@ -90,15 +98,11 @@ def test_save_load_from_different_config(self): new_obj_1 = SchedulerObject2.from_config(config) # now save a config parameter that is not expected - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "r") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: data = json.load(f) data["unexpected"] = True - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "w") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: json.dump(data, f) with CaptureLogger(logger) as cap_logger_2: @@ -115,12 +119,12 @@ def test_save_load_from_different_config(self): assert cap_logger_1.out == "" assert ( - cap_logger_2.out == - "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" + cap_logger_2.out + == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" " will" - " be ignored. Please verify your config.json configuration file.\n") - assert (cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") - == cap_logger_3.out) + " be ignored. 
Please verify your config.json configuration file.\n" + ) + assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out def test_save_load_compatible_schedulers(self): SchedulerObject2._compatibles = ["SchedulerObject"] @@ -137,16 +141,12 @@ def test_save_load_compatible_schedulers(self): obj.save_config(tmpdirname) # now save a config parameter that is expected by another class, but not origin class - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "r") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: data = json.load(f) data["f"] = [0, 0] data["unexpected"] = True - with open( - os.path.join(tmpdirname, SchedulerObject.config_name), - "w") as f: + with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: json.dump(data, f) with CaptureLogger(logger) as cap_logger: @@ -156,10 +156,11 @@ def test_save_load_compatible_schedulers(self): assert new_obj.__class__ == SchedulerObject assert ( - cap_logger.out == - "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" + cap_logger.out + == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" " will" - " be ignored. Please verify your config.json configuration file.\n") + " be ignored. Please verify your config.json configuration file.\n" + ) def test_save_load_from_different_config_comp_schedulers(self): SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"] @@ -195,14 +196,8 @@ def test_save_load_from_different_config_comp_schedulers(self): assert new_obj_3.__class__ == SchedulerObject3 assert cap_logger_1.out == "" - assert ( - cap_logger_2.out == - "{'f'} was not found in config. Values will be initialized to default values.\n" - ) - assert ( - cap_logger_3.out == - "{'f'} was not found in config. Values will be initialized to default values.\n" - ) + assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n" + assert cap_logger_3.out == "{'f'} was not found in config. 
Values will be initialized to default values.\n" class SchedulerCommonTest(unittest.TestCase): @@ -252,9 +247,10 @@ def check_over_configs(self, time_step=0, **config): for scheduler_class in self.scheduler_classes: # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): time_step = float(time_step) scheduler_config = self.get_scheduler_config(**config) @@ -273,12 +269,10 @@ def check_over_configs(self, time_step=0, **config): scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # Make sure `scale_model_input` is invoked to prevent a warning @@ -287,20 +281,15 @@ def check_over_configs(self, time_step=0, **config): _ = new_scheduler.scale_model_input(sample, 0) # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def check_over_forward(self, time_step=0, **forward_kwargs): kwargs = dict(self.forward_default_kwargs) @@ -310,9 +299,10 @@ def check_over_forward(self, time_step=0, **forward_kwargs): for scheduler_class in self.scheduler_classes: if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): time_step = float(time_step) scheduler_config = self.get_scheduler_config() @@ -331,28 +321,21 @@ def check_over_forward(self, time_step=0, **forward_kwargs): scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, 
"set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, - **kwargs).prev_sample + new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_from_save_pretrained(self): kwargs = dict(self.forward_default_kwargs) @@ -362,9 +345,10 @@ def test_from_save_pretrained(self): for scheduler_class in self.scheduler_classes: timestep = 1 if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): timestep = float(timestep) scheduler_config = self.get_scheduler_config() @@ -383,28 +367,21 @@ def test_from_save_pretrained(self): scheduler.save_config(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - output = scheduler.step(residual, timestep, sample, - **kwargs).prev_sample + output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - new_output = new_scheduler.step(residual, timestep, sample, - **kwargs).prev_sample + new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample - assert (paddle.sum(paddle.abs(output - new_output)) < 1e-5 - ), "Scheduler outputs are not identical" + assert paddle.sum(paddle.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" def test_compatibles(self): for scheduler_class in self.scheduler_classes: @@ -415,31 +392,20 @@ def test_compatibles(self): assert all(c is not None for c in scheduler.compatibles) for comp_scheduler_cls in scheduler.compatibles: - comp_scheduler = comp_scheduler_cls.from_config( - scheduler.config) + comp_scheduler = comp_scheduler_cls.from_config(scheduler.config) assert 
comp_scheduler is not None new_scheduler = scheduler_class.from_config(comp_scheduler.config) - new_scheduler_config = { - k: v - for k, v in new_scheduler.config.items() - if k in scheduler.config - } - scheduler_diff = { - k: v - for k, v in new_scheduler.config.items() - if k not in scheduler.config - } + new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in scheduler.config} + scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config} # make sure that configs are essentially identical assert new_scheduler_config == dict(scheduler.config) # make sure that only differences are for configs that are not in init - init_keys = inspect.signature( - scheduler_class.__init__).parameters.keys() - assert set(scheduler_diff.keys()).intersection(set( - init_keys)) == set() + init_keys = inspect.signature(scheduler_class.__init__).parameters.keys() + assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set() def test_from_pretrained(self): for scheduler_class in self.scheduler_classes: @@ -463,9 +429,10 @@ def test_step_shape(self): for scheduler_class in self.scheduler_classes: if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): timestep_0 = float(timestep_0) timestep_1 = float(timestep_1) @@ -481,17 +448,13 @@ def test_step_shape(self): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps - output_0 = scheduler.step(residual, timestep_0, sample, - **kwargs).prev_sample - output_1 = scheduler.step(residual, timestep_1, sample, - **kwargs).prev_sample + output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample + output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample self.assertEqual(output_0.shape, sample.shape) self.assertEqual(output_0.shape, output_1.shape) @@ -504,12 +467,10 @@ def set_nan_tensor_to_zero(t): def recursive_check(tuple_object, dict_object): if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object, dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip( - tuple_object.values(), dict_object.values()): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): recursive_check(tuple_iterable_value, dict_iterable_value) elif tuple_object is None: return @@ -518,27 +479,29 @@ def recursive_check(tuple_object, dict_object): paddle.allclose( set_nan_tensor_to_zero(tuple_object).cast("float32"), set_nan_tensor_to_zero(dict_object).cast("float32"), - atol=1e-5, ), + atol=1e-5, + ), msg=( "Tuple and dict output are not equal. Difference:" f" {paddle.max(paddle.abs(tuple_object - dict_object))}. Tuple has `nan`:" f" {paddle.isnan(tuple_object).any()} and `inf`: {paddle.isinf(tuple_object)}. 
Dict has" f" `nan`: {paddle.isnan(dict_object).any()} and `inf`: {paddle.isinf(dict_object)}." - ), ) + ), + ) kwargs = dict(self.forward_default_kwargs) num_inference_steps = kwargs.pop("num_inference_steps", 50) timestep = 0 - if (len(self.scheduler_classes) > 0 and - self.scheduler_classes[0] == IPNDMScheduler): + if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler: timestep = 1 for scheduler_class in self.scheduler_classes: if scheduler_class in ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, ): + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + ): timestep = float(timestep) scheduler_config = self.get_scheduler_config() @@ -553,32 +516,25 @@ def recursive_check(tuple_object, dict_object): sample = self.dummy_sample residual = 0.1 * sample - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) outputs_dict = scheduler.step(residual, timestep, sample, **kwargs) - if num_inference_steps is not None and hasattr(scheduler, - "set_timesteps"): + if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr( - scheduler, "set_timesteps"): + elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): kwargs["num_inference_steps"] = num_inference_steps # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set( - inspect.signature(scheduler.step).parameters.keys()): + if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): kwargs["generator"] = paddle.Generator().manual_seed(0) - outputs_tuple = scheduler.step( - residual, timestep, sample, return_dict=False, **kwargs) + outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs) recursive_check(outputs_tuple, outputs_dict) @@ -594,8 +550,11 @@ def test_scheduler_public_api(self): ) self.assertTrue( hasattr(scheduler, "scale_model_input"), - (f"{scheduler_class} does not implement a required class method `scale_model_input(sample," - " timestep)`"), ) + ( + f"{scheduler_class} does not implement a required class method `scale_model_input(sample," + " timestep)`" + ), + ) self.assertTrue( hasattr(scheduler, "step"), f"{scheduler_class} does not implement a required class method `step(...)`", @@ -625,9 +584,7 @@ def test_add_noise_device(self): def test_deprecated_kwargs(self): for scheduler_class in self.scheduler_classes: - has_kwarg_in_model_class = ( - "kwargs" in - inspect.signature(scheduler_class.__init__).parameters) + has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters has_deprecated_kwarg = 
len(scheduler_class._deprecated_kwargs) > 0 if has_kwarg_in_model_class and not has_deprecated_kwarg: @@ -635,7 +592,8 @@ def test_deprecated_kwargs(self): f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated" " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if" " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`") + " []`" + ) if not has_kwarg_in_model_class and has_deprecated_kwarg: raise ValueError( @@ -651,8 +609,7 @@ def test_trained_betas(self): continue scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class( - **scheduler_config, trained_betas=np.array([0.1, 0.3])) + scheduler = scheduler_class(**scheduler_config, trained_betas=np.array([0.1, 0.3])) with tempfile.TemporaryDirectory() as tmpdirname: scheduler.save_pretrained(tmpdirname) @@ -680,8 +637,7 @@ def test_getattr_is_correct(self): # no warning should be thrown assert cap_logger.out == "" - logger = logging.get_logger( - "ppdiffusers.schedulers.schedulering_utils") + logger = logging.get_logger("ppdiffusers.schedulers.schedulering_utils") # 30 for warning logger.setLevel(30) with CaptureLogger(logger) as cap_logger: @@ -703,7 +659,4 @@ def test_getattr_is_correct(self): with self.assertRaises(AttributeError) as error: scheduler.does_not_exist - assert ( - str(error.exception) == - f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'" - ) + assert str(error.exception) == f"'{type(scheduler).__name__}' object has no attribute 'does_not_exist'" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000..4b868b99b22f9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.isort] +profile = 'black' +known_third_party = ["paddle"] + +[tool.black] +line-length = 119 +target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310'] +exclude = ['.flake8'] + +[tool.pytest.ini_options] +minversion = "6.0" +pythonpath = ["."] +testpaths = [ + # "tests/models", +] +python_files = [ + "test.py", + "test_*.py" +] +filterwarnings = [ + "ignore::UserWarning", + 'ignore::DeprecationWarning', +] \ No newline at end of file diff --git a/setup.py b/setup.py index 2578b4bad4f96..0074ba09ce033 100644 --- a/setup.py +++ b/setup.py @@ -46,8 +46,7 @@ def read_requirements(): setup( name="paddlemix", - packages=(find_packages() + find_packages( - where="./ppdiffusers", exclude=["tests", "tests.*"])), + packages=(find_packages() + find_packages(where="./ppdiffusers", exclude=["tests", "tests.*"])), package_dir={ "": ".", "ppdiffusers": "./ppdiffusers/ppdiffusers", @@ -62,10 +61,7 @@ def read_requirements(): keywords=["paddle", "paddlemix"], install_requires=REQUIRED_PACKAGES, python_requires=">=3.6", - entry_points={ - "console_scripts": - ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"] - }, + entry_points={"console_scripts": ["ppdiffusers-cli=ppdiffusers.commands.ppdiffusers_cli:main"]}, classifiers=[ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", @@ -75,4 +71,5 @@ def read_requirements(): "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], - license="Apache 2.0", ) + license="Apache 2.0", +) diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py index 4193ca0f0a0de..d11db96722581 100644 --- a/tests/models/test_blip2.py +++ b/tests/models/test_blip2.py @@ -21,49 +21,54 @@ import numpy as np import paddle import paddle.nn as nn -import 
requests from paddlenlp.transformers.opt.configuration import OPTConfig -from PIL import Image -from paddlemix.models.blip2 import (Blip2Config, Blip2ForConditionalGeneration, - Blip2QFormerConfig, Blip2VisionConfig) +from paddlemix.models.blip2 import ( + Blip2Config, + Blip2ForConditionalGeneration, + Blip2QFormerConfig, + Blip2VisionConfig, +) from paddlemix.models.blip2.eva_vit import VisionTransformer -from paddlemix.models.blip2.modeling import \ - BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST +from paddlemix.models.blip2.modeling import BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST from paddlemix.models.blip2.Qformer import BertLMHeadModel from tests.models.test_configuration_common import ConfigTester from tests.models.test_modeling_common import ( - ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask) + ModelTesterMixin, + floats_tensor, + ids_tensor, + random_attention_mask, +) from tests.testing_utils import slow def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if ("_range" in key or "_std" in key or "initializer_factor" in key or - "layer_scale" in key): + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: setattr(configs_no_init, key, 1e-10) return configs_no_init class Blip2VisionModelTester: def __init__( - self, - parent, - batch_size=12, - image_size=30, - patch_size=2, - num_channels=3, - is_training=True, - hidden_size=1408, - projection_dim=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - initializer_range=1e-10, - scope=None, ): + self, + parent, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + hidden_size=1408, + projection_dim=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.image_size = image_size @@ -81,13 +86,11 @@ def __init__( self.scope = scope # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) - num_patches = (image_size // patch_size)**2 + num_patches = (image_size // patch_size) ** 2 self.seq_length = num_patches + 1 def prepare_config_and_inputs(self): - pixel_values = floats_tensor([ - self.batch_size, self.num_channels, self.image_size, self.image_size - ]) + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) config = self.get_config() return config, pixel_values @@ -104,7 +107,8 @@ def get_config(self): intermediate_size=self.intermediate_size, dropout=self.dropout, attention_dropout=self.attention_dropout, - initializer_range=self.initializer_range, ) + initializer_range=self.initializer_range, + ) def create_and_check_model(self, config, pixel_values): model = VisionTransformer(config=config) @@ -114,13 +118,12 @@ def create_and_check_model(self, config, pixel_values): # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) image_size = (self.image_size, self.image_size) patch_size = (self.patch_size, self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // - patch_size[0]) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) self.parent.assertEqual( result.last_hidden_state.shape, - [self.batch_size, num_patches + 1, self.hidden_size], ) - self.parent.assertEqual(result.pooler_output.shape, - [self.batch_size, 
self.hidden_size]) + [self.batch_size, num_patches + 1, self.hidden_size], + ) + self.parent.assertEqual(result.pooler_output.shape, [self.batch_size, self.hidden_size]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -135,7 +138,7 @@ class Blip2VisionModelTest(ModelTesterMixin, unittest.TestCase): attention_mask and seq_length. """ - all_model_classes = (VisionTransformer, ) + all_model_classes = (VisionTransformer,) fx_compatible = False test_pruning = False test_resize_embeddings = False @@ -148,7 +151,8 @@ def setUp(self): self, config_class=Blip2VisionConfig, has_text_modality=False, - hidden_size=37, ) + hidden_size=37, + ) def test_config(self): self.config_tester.run_common_tests() @@ -191,28 +195,29 @@ def test_model_from_pretrained(self): class BertLMHeadModelTester: def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - hidden_size=768, - projection_dim=32, - num_hidden_layers=6, - num_attention_heads=4, - intermediate_size=37, - dropout=0.1, - attention_dropout=0.1, - max_position_embeddings=512, - initializer_range=0.02, - bos_token_id=0, - scope=None, - num_patches=257, - encoder_hidden_size=1408, - encoder_width=1408, ): + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + hidden_size=768, + projection_dim=32, + num_hidden_layers=6, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + max_position_embeddings=512, + initializer_range=0.02, + bos_token_id=0, + scope=None, + num_patches=257, + encoder_hidden_size=1408, + encoder_width=1408, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -236,12 +241,9 @@ def __init__( self.encoder_width = encoder_width def prepare_config_and_inputs(self): - query_embeds = floats_tensor( - [self.batch_size, self.seq_length, self.hidden_size]) - encoder_hidden_states = floats_tensor( - [self.batch_size, self.num_patches, self.encoder_hidden_size]) - encoder_attention_mask = random_attention_mask( - [self.batch_size, self.num_patches]) + query_embeds = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_hidden_states = floats_tensor([self.batch_size, self.num_patches, self.encoder_hidden_size]) + encoder_attention_mask = random_attention_mask([self.batch_size, self.num_patches]) config = self.get_config() return config, query_embeds, encoder_hidden_states, encoder_attention_mask @@ -259,19 +261,21 @@ def get_config(self): max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, bos_token_id=self.bos_token_id, - encoder_hidden_size=self.encoder_hidden_size, ) + encoder_hidden_size=self.encoder_hidden_size, + ) - def create_and_check_model(self, config, query_embeds, - encoder_hidden_states, encoder_attention_mask): + def create_and_check_model(self, config, query_embeds, encoder_hidden_states, encoder_attention_mask): model = BertLMHeadModel(config=config, encoder_width=self.encoder_width) model.eval() result = model( query_embeds=query_embeds, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, ) + encoder_attention_mask=encoder_attention_mask, + ) self.parent.assertEqual( result.last_hidden_state.shape, - [self.batch_size, self.seq_length, self.hidden_size], ) + [self.batch_size, self.seq_length, self.hidden_size], + ) model = 
BertLMHeadModel(config=config) model.eval() @@ -279,11 +283,13 @@ def create_and_check_model(self, config, query_embeds, result = model( query_embeds, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, ) + encoder_attention_mask=encoder_attention_mask, + ) self.parent.assertEqual( result.last_hidden_state.shape, - [self.batch_size, self.seq_length, self.hidden_size], ) + [self.batch_size, self.seq_length, self.hidden_size], + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -291,7 +297,8 @@ def prepare_config_and_inputs_for_common(self): config, query_embeds, encoder_hidden_states, - encoder_attention_mask, ) = config_and_inputs + encoder_attention_mask, + ) = config_and_inputs inputs_dict = { "query_embeds": query_embeds, "encoder_hidden_states": encoder_hidden_states, @@ -301,7 +308,7 @@ def prepare_config_and_inputs_for_common(self): class BertLMHeadModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BertLMHeadModel, ) + all_model_classes = (BertLMHeadModel,) fx_compatible = False test_pruning = False test_resize_embeddings = False @@ -314,7 +321,8 @@ def setUp(self): self, config_class=Blip2QFormerConfig, has_text_modality=False, - hidden_size=37, ) + hidden_size=37, + ) def test_config(self): self.config_tester.run_common_tests() @@ -337,28 +345,29 @@ def test_save_load(self): class Blip2TextModelTester: def __init__( - self, - parent, - batch_size=12, - seq_length=7, - is_training=True, - use_labels=False, - vocab_size=99, - hidden_size=16, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=4, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=20, - eos_token_id=2, - pad_token_id=1, - bos_token_id=0, - embed_dim=16, - num_labels=3, - word_embed_proj_dim=16, - type_sequence_label_size=2, ): + self, + parent, + batch_size=12, + seq_length=7, + is_training=True, + use_labels=False, + vocab_size=99, + hidden_size=16, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=20, + eos_token_id=2, + pad_token_id=1, + bos_token_id=0, + embed_dim=16, + num_labels=3, + word_embed_proj_dim=16, + type_sequence_label_size=2, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -385,14 +394,12 @@ def __init__( def prepare_config_and_inputs(self): config = self.get_config() - input_ids = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size, - dtype="int64").clip(3, ) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype="int64").clip( + 3, + ) input_ids[:, -1] = self.eos_token_id # Eos Token - attention_mask = input_ids.not_equal( - paddle.to_tensor( - [self.pad_token_id], dtype="int64")).cast("int64") + attention_mask = input_ids.not_equal(paddle.to_tensor([self.pad_token_id], dtype="int64")).cast("int64") return config, input_ids, attention_mask @@ -411,18 +418,20 @@ def get_config(self): pad_token_id=self.pad_token_id, embed_dim=self.embed_dim, is_encoder_decoder=False, - word_embed_proj_dim=self.word_embed_proj_dim, ) + word_embed_proj_dim=self.word_embed_proj_dim, + ) class Blip2ModelTester: def __init__( - self, - parent, - vision_kwargs=None, - qformer_kwargs=None, - text_kwargs=None, - is_training=True, - num_query_tokens=10, ): + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + 
text_kwargs=None, + is_training=True, + num_query_tokens=10, + ): if vision_kwargs is None: vision_kwargs = {} if qformer_kwargs is None: @@ -431,10 +440,8 @@ def __init__( text_kwargs = {} self.parent = parent - self.vision_model_tester = Blip2VisionModelTester(parent, - **vision_kwargs) - self.qformer_model_tester = BertLMHeadModelTester(parent, - **qformer_kwargs) + self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = BertLMHeadModelTester(parent, **qformer_kwargs) self.text_model_tester = Blip2TextModelTester(parent, **text_kwargs) self.is_training = is_training self.num_query_tokens = num_query_tokens @@ -456,16 +463,15 @@ def get_config(self): vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), - num_query_tokens=self.num_query_tokens, ) + num_query_tokens=self.num_query_tokens, + ) @unittest.skip(reason="BLIP-2's output needs to unified") - def create_and_check_for_conditional_generation( - self, config, input_ids, attention_mask, pixel_values): + def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values): model = Blip2ForConditionalGeneration(config) model.eval() with paddle.no_grad(): - result = model( - pixel_values, input_ids, attention_mask, return_dict=True) + result = model(pixel_values, input_ids, attention_mask, return_dict=True) self.parent.assertEqual( result.logits.shape, @@ -473,7 +479,8 @@ def create_and_check_for_conditional_generation( self.vision_model_tester.batch_size, self.text_model_tester.seq_length + self.num_query_tokens, self.text_model_tester.vocab_size, - ], ) + ], + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -481,7 +488,8 @@ def prepare_config_and_inputs_for_common(self): config, input_ids, attention_mask, - pixel_values, ) = config_and_inputs + pixel_values, + ) = config_and_inputs inputs_dict = { "pixel_values": pixel_values, "input_ids": input_ids, @@ -491,7 +499,7 @@ def prepare_config_and_inputs_for_common(self): class Blip2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Blip2ForConditionalGeneration, ) + all_model_classes = (Blip2ForConditionalGeneration,) fx_compatible = False test_head_masking = False test_pruning = False @@ -505,16 +513,14 @@ def setUp(self): def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_conditional_generation( - *config_and_inputs) + self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass def test_determinism(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_determinism(first, second): out_1 = first.numpy() @@ -551,22 +557,19 @@ def test_forward_signature(self): self.assertListEqual(arg_names[:1], expected_arg_names) def test_load_vision_qformer_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # Save Blip2Config and check if we can load Blip2VisionConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: 
config.save_pretrained(tmp_dir_name) vision_config = Blip2VisionConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.vision_config.to_dict(), - vision_config.to_dict()) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) # Save Blip2Config and check if we can load Blip2QFormerConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: config.save_pretrained(tmp_dir_name) qformer_config = Blip2QFormerConfig.from_pretrained(tmp_dir_name) - self.assertDictEqual(config.qformer_config.to_dict(), - qformer_config.to_dict()) + self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) @slow def test_model_from_pretrained(self): diff --git a/tests/models/test_configuration_common.py b/tests/models/test_configuration_common.py index b014bbfe522ea..839941f706385 100644 --- a/tests/models/test_configuration_common.py +++ b/tests/models/test_configuration_common.py @@ -12,22 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import json import os import tempfile -import unittest.mock as mock - -from paddlenlp.transformers.configuration_utils import PretrainedConfig -from requests.exceptions import HTTPError class ConfigTester(object): - def __init__(self, - parent, - config_class=None, - has_text_modality=True, - **kwargs): + def __init__(self, parent, config_class=None, has_text_modality=True, **kwargs): self.parent = parent self.config_class = config_class self.has_text_modality = has_text_modality diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 43ae283d9149d..226caf803f84a 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -18,9 +18,7 @@ import os import random import shutil -import subprocess import tempfile -import time import unittest from typing import Optional, Tuple, Type @@ -36,8 +34,7 @@ def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if ("_range" in key or "_std" in key or "initializer_factor" in key or - "layer_scale" in key): + if "_range" in key or "_std" in key or "initializer_factor" in key or "layer_scale" in key: setattr(configs_no_init, key, 1e-10) return configs_no_init @@ -64,11 +61,8 @@ def floats_tensor(shape, scale=1.0): return scale * paddle.randn(shape, dtype="float32") -def check_two_model_parameter(first_model: PretrainedModel, - second_model: PretrainedModel): - assert (len( - set(first_model.state_dict().keys()) - set(second_model.state_dict() - .keys())) == 0) +def check_two_model_parameter(first_model: PretrainedModel, second_model: PretrainedModel): + assert len(set(first_model.state_dict().keys()) - set(second_model.state_dict().keys())) == 0 # random choice the keys to compare key = random.choice(list(first_model.state_dict().keys())) @@ -106,8 +100,7 @@ def _make_model_instance(self, config, model_class): return model_class(self.base_model_class(**config)) def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - ) + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_save_load(out1, out2): # make sure we don't have nans @@ -123,16 +116,14 @@ def check_save_load(out1, out2): model = self._make_model_instance(config, model_class) model.eval() with paddle.no_grad(): - first = model(**self._prepare_for_class(inputs_dict, - model_class))[0] + first = 
model(**self._prepare_for_class(inputs_dict, model_class))[0]

             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model = model_class.from_pretrained(tmpdirname)
                 model.eval()
                 with paddle.no_grad():
-                    second = model(**self._prepare_for_class(inputs_dict,
-                                                             model_class))[0]
+                    second = model(**self._prepare_for_class(inputs_dict, model_class))[0]

             # support tuple of tensor
             if isinstance(first, tuple) and isinstance(second, tuple):
@@ -142,8 +133,7 @@ def check_save_load(out1, out2):
             check_save_load(first, second)

     def test_determinism(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         def check_determinism(first, second):
             out_1 = first.numpy()
@@ -157,10 +147,8 @@ def check_determinism(first, second):
             model = self._make_model_instance(config, model_class)
             model.eval()
             with paddle.no_grad():
-                first = model(**self._prepare_for_class(inputs_dict,
-                                                        model_class))[0]
-                second = model(**self._prepare_for_class(inputs_dict,
-                                                         model_class))[0]
+                first = model(**self._prepare_for_class(inputs_dict, model_class))[0]
+                second = model(**self._prepare_for_class(inputs_dict, model_class))[0]

             if isinstance(first, tuple) and isinstance(second, tuple):
                 for tensor1, tensor2 in zip(first, second):
@@ -190,30 +178,21 @@ def test_training_gradient_checkpointing(self):
     def test_attention_outputs(self):
         if not self.has_attentions:
             return
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         seq_len = getattr(self.model_tester, "seq_length", None)
-        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length",
-                                     seq_len)
-        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length",
-                                     seq_len)
-        decoder_key_length = getattr(self.model_tester, "decoder_key_length",
-                                     decoder_seq_length)
-        encoder_key_length = getattr(self.model_tester, "key_length",
-                                     encoder_seq_length)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
         chunk_length = getattr(self.model_tester, "chunk_length", None)
-        if chunk_length is not None and hasattr(self.model_tester,
-                                                "num_hashes"):
+        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
             encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes

         for model_class in self.all_model_classes:
             signature = inspect.signature(model_class.forward)
             # signature.parameters is an OrderedDict => so arg_names order is deterministic
             arg_names = [*signature.parameters.keys()]
-            if not all(
-                    name in arg_names
-                    for name in
-                ["output_attentions", "output_hidden_states", "return_dict"]):
+            if not all(name in arg_names for name in ["output_attentions", "output_hidden_states", "return_dict"]):
                 continue
             inputs_dict["output_attentions"] = True
             inputs_dict["output_hidden_states"] = False
@@ -221,12 +200,9 @@ def test_attention_outputs(self):
             model = self._make_model_instance(config, model_class)
             model.eval()
             with paddle.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict,
-                                                          model_class))
-            attentions = (outputs.encoder_attentions
-                          if self.is_encoder_decoder else outputs.attentions)
-            self.assertEqual(
-                len(attentions), self.model_tester.num_hidden_layers)
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

             # TODO(guosheng): check that output_attentions also work using config

@@ -238,7 +214,8 @@ def test_attention_outputs(self):
                         encoder_seq_length,
                         chunk_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )
             else:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
@@ -246,7 +223,8 @@ def test_attention_outputs(self):
                         self.model_tester.num_attention_heads,
                         encoder_seq_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )
             out_len = len(outputs)

             if self.is_encoder_decoder:
@@ -257,9 +235,7 @@ def test_attention_outputs(self):
                     correct_outlen += 1  # loss is added to beginning
                 # Question Answering model returns start_logits and end_logits
                 if model_class.__name__.endswith("ForQuestionAnswering"):
-                    correct_outlen += (
-                        1  # start_logits and end_logits instead of only 1 output
-                    )
+                    correct_outlen += 1  # start_logits and end_logits instead of only 1 output

                 if "past_key_values" in outputs:
                     correct_outlen += 1  # past_key_values have been returned
@@ -268,29 +244,28 @@ def test_attention_outputs(self):
                 # decoder attentions
                 decoder_attentions = outputs.decoder_attentions
                 self.assertIsInstance(decoder_attentions, (list, tuple))
-                self.assertEqual(
-                    len(decoder_attentions),
-                    self.model_tester.num_hidden_layers)
+                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(decoder_attentions[0].shape[-3:]),
                     [
                         self.model_tester.num_attention_heads,
                         decoder_seq_length,
                         decoder_key_length,
-                    ], )
+                    ],
+                )

                 # cross attentions
                 cross_attentions = outputs.cross_attentions
                 self.assertIsInstance(cross_attentions, (list, tuple))
-                self.assertEqual(
-                    len(cross_attentions), self.model_tester.num_hidden_layers)
+                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
                     list(cross_attentions[0].shape[-3:]),
                     [
                         self.model_tester.num_attention_heads,
                         decoder_seq_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )

             # Check attention is always last and order is fine
             inputs_dict["output_attentions"] = True
@@ -298,8 +273,7 @@ def test_attention_outputs(self):
             model = self._make_model_instance(config, model_class)
             model.eval()
             with paddle.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict,
-                                                          model_class))
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

             if hasattr(self.model_tester, "num_hidden_states_types"):
                 added_hidden_states = self.model_tester.num_hidden_states_types
@@ -309,11 +283,9 @@ def test_attention_outputs(self):
                 added_hidden_states = 1
             self.assertEqual(out_len + added_hidden_states, len(outputs))

-            self_attentions = (outputs.encoder_attentions if
-                               self.is_encoder_decoder else outputs.attentions)
+            self_attentions = outputs.encoder_attentions if self.is_encoder_decoder else outputs.attentions

-            self.assertEqual(
-                len(self_attentions), self.model_tester.num_hidden_layers)
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
             if chunk_length is not None:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-4:]),
@@ -322,7 +294,8 @@ def test_attention_outputs(self):
                         encoder_seq_length,
                         chunk_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )
             else:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-3:]),
@@ -330,7 +303,8 @@ def test_attention_outputs(self):
                         self.model_tester.num_attention_heads,
                         encoder_seq_length,
                         encoder_key_length,
-                    ], )
+                    ],
+                )

     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
@@ -338,29 +312,28 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             model.eval()

             with paddle.no_grad():
-                outputs = model(**self._prepare_for_class(inputs_dict,
-                                                          model_class))
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

-            hidden_states = (outputs.encoder_hidden_states if
-                             self.is_encoder_decoder else outputs.hidden_states)
+            hidden_states = outputs.encoder_hidden_states if self.is_encoder_decoder else outputs.hidden_states

             expected_num_layers = getattr(
                 self.model_tester,
                 "expected_num_hidden_layers",
-                self.model_tester.num_hidden_layers + 1, )
+                self.model_tester.num_hidden_layers + 1,
+            )
             self.assertEqual(len(hidden_states), expected_num_layers)

             if hasattr(self.model_tester, "encoder_seq_length"):
                 seq_length = self.model_tester.encoder_seq_length
-                if (hasattr(self.model_tester, "chunk_length") and
-                        self.model_tester.chunk_length > 1):
+                if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1:
                     seq_length = seq_length * self.model_tester.chunk_length
             else:
                 seq_length = self.model_tester.seq_length

             self.assertListEqual(
                 list(hidden_states[0].shape[-2:]),
-                [seq_length, self.model_tester.hidden_size], )
+                [seq_length, self.model_tester.hidden_size],
+            )

             if self.is_encoder_decoder:
                 hidden_states = outputs.decoder_hidden_states
@@ -368,24 +341,20 @@ def check_hidden_states_output(inputs_dict, config, model_class):
                 self.assertIsInstance(hidden_states, (list, tuple))
                 self.assertEqual(len(hidden_states), expected_num_layers)
                 seq_len = getattr(self.model_tester, "seq_length", None)
-                decoder_seq_length = getattr(self.model_tester,
-                                             "decoder_seq_length", seq_len)
+                decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [decoder_seq_length, self.model_tester.hidden_size], )
+                    [decoder_seq_length, self.model_tester.hidden_size],
+                )

-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         inputs_dict["return_dict"] = True
         for model_class in self.all_model_classes:
             signature = inspect.signature(model_class.forward)
             # signature.parameters is an OrderedDict => so arg_names order is deterministic
             arg_names = [*signature.parameters.keys()]
-            if not all(
-                    name in arg_names
-                    for name in
-                ["output_attentions", "output_hidden_states", "return_dict"]):
+            if not all(name in arg_names for name in ["output_attentions", "output_hidden_states", "return_dict"]):
                 continue
             inputs_dict["output_hidden_states"] = True
             check_hidden_states_output(inputs_dict, config, model_class)
@@ -417,7 +386,8 @@ def test_resize_position_vector_embeddings(self):
             if self.is_encoder_decoder:
                 (
                     encoder_model_embed,
-                    decoder_model_embed, ) = model.get_position_embeddings()
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
                 encoder_cloned_embeddings = encoder_model_embed.weight.clone()
                 decoder_cloned_embeddings = decoder_model_embed.weight.clone()
             else:
@@ -427,24 +397,25 @@ def test_resize_position_vector_embeddings(self):
             # Check that resizing the position embeddings with a larger max_position_embeddings increases
             # the model's postion embeddings size
             model.resize_position_embeddings(max_position_embeddings + 10)
-            self.assertEqual(model.config.max_position_embeddings,
-                             max_position_embeddings + 10)
+            self.assertEqual(model.config.max_position_embeddings, max_position_embeddings + 10)

             # Check that it actually resizes the embeddings matrix
             if model.config.is_encoder_decoder:
                 (
                     encoder_model_embed,
-                    decoder_model_embed, ) = model.get_position_embeddings()
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
                 self.assertEqual(
                     encoder_model_embed.weight.shape[0],
-                    encoder_cloned_embeddings.shape[0] + 10, )
+                    encoder_cloned_embeddings.shape[0] + 10,
+                )
                 self.assertEqual(
                     decoder_model_embed.weight.shape[0],
-                    decoder_cloned_embeddings.shape[0] + 10, )
+                    decoder_cloned_embeddings.shape[0] + 10,
+                )
             else:
                 model_embed = model.get_position_embeddings()
-                self.assertEqual(model_embed.weight.shape[0],
-                                 cloned_embeddings.shape[0] + 10)
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             model(**self._prepare_for_class(inputs_dict, model_class))
@@ -454,23 +425,26 @@ def test_resize_position_vector_embeddings(self):
             model.resize_position_embeddings(max_position_embeddings - 5)
             self.assertEqual(
                 model.base_model.config["max_position_embeddings"],
-                max_position_embeddings - 5, )
+                max_position_embeddings - 5,
+            )

             # Check that it actually resizes the embeddings matrix
             if self.is_encoder_decoder:
                 (
                     encoder_model_embed,
-                    decoder_model_embed, ) = model.get_position_embeddings()
+                    decoder_model_embed,
+                ) = model.get_position_embeddings()
                 self.assertEqual(
                     encoder_model_embed.weight.shape[0],
-                    encoder_cloned_embeddings.shape[0] - 5, )
+                    encoder_cloned_embeddings.shape[0] - 5,
+                )
                 self.assertEqual(
                     decoder_model_embed.weight.shape[0],
-                    decoder_cloned_embeddings.shape[0] - 5, )
+                    decoder_cloned_embeddings.shape[0] - 5,
+                )
             else:
                 model_embed = model.get_position_embeddings()
-                self.assertEqual(model_embed.weight.shape[0],
-                                 cloned_embeddings.shape[0] - 5)
+                self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 5)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             model(**self._prepare_for_class(inputs_dict, model_class))
@@ -479,12 +453,10 @@ def test_resize_position_vector_embeddings(self):

             models_equal = True
             if model.config.is_encoder_decoder:
-                for p1, p2 in zip(encoder_cloned_embeddings,
-                                  encoder_model_embed.weight):
+                for p1, p2 in zip(encoder_cloned_embeddings, encoder_model_embed.weight):
                     if p1.data.ne(p2.data).sum() > 0:
                         models_equal = False
-                for p1, p2 in zip(decoder_cloned_embeddings,
-                                  decoder_model_embed.weight):
+                for p1, p2 in zip(decoder_cloned_embeddings, decoder_model_embed.weight):
                     if p1.data.ne(p2.data).sum() > 0:
                         models_equal = False
             else:
@@ -515,32 +487,27 @@ def test_resize_tokens_embeddings(self):
             # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
             model_embed = model.resize_token_embeddings(model_vocab_size + 10)
-            self.assertEqual(model.base_model.config.vocab_size,
-                             model_vocab_size + 10)
+            self.assertEqual(model.base_model.config.vocab_size, model_vocab_size + 10)
             # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0],
-                             cloned_embeddings.shape[0] + 10)
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             model(**self._prepare_for_class(inputs_dict, model_class))

             # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
             model_embed = model.resize_token_embeddings(model_vocab_size - 15)
-            self.assertEqual(model.base_model.config.vocab_size,
-                             model_vocab_size - 15)
+            self.assertEqual(model.base_model.config.vocab_size, model_vocab_size - 15)
             # Check that it actually resizes the embeddings matrix
-            self.assertEqual(model_embed.weight.shape[0],
-                             cloned_embeddings.shape[0] - 15)
+            self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)

             # Check that the model can still do a forward pass successfully (every parameter should be resized)
             # Input ids should be clamped to the maximum size of the vocabulary
-            inputs_dict["input_ids"] = paddle.clip(
-                inputs_dict["input_ids"], max=model_vocab_size - 15 - 1)
+            inputs_dict["input_ids"] = paddle.clip(inputs_dict["input_ids"], max=model_vocab_size - 15 - 1)

             # make sure that decoder_input_ids are resized as well
             if "decoder_input_ids" in inputs_dict:
                 inputs_dict["decoder_input_ids"] = paddle.clip(
-                    inputs_dict["decoder_input_ids"],
-                    max=model_vocab_size - 15 - 1)
+                    inputs_dict["decoder_input_ids"], max=model_vocab_size - 15 - 1
+                )
             model(**self._prepare_for_class(inputs_dict, model_class))

             # Check that adding and removing tokens has not modified the first part of the embedding matrix.
@@ -566,15 +533,13 @@ def test_inputs_embeds(self):
         if not self.use_test_inputs_embeds:
             return
         # get config for model and inputs_dict for model forward
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         # test all model classes
         for model_class in self.all_model_classes:
             model = self._make_model_instance(config, model_class)
             model.eval()
-            inputs = copy.deepcopy(
-                self._prepare_for_class(inputs_dict, model_class))
+            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))

             with paddle.no_grad():
                 ids_output = model(**inputs)
@@ -584,8 +549,7 @@ def test_inputs_embeds(self):
                 del inputs["input_ids"]
             else:
                 encoder_input_ids = inputs["input_ids"]
-                decoder_input_ids = inputs.get("decoder_input_ids",
-                                               encoder_input_ids)
+                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
                 del inputs["input_ids"]
                 inputs.pop("decoder_input_ids", None)

@@ -616,8 +580,7 @@ def test_model_name_list(self):
         self.assertTrue(len(model.model_name_list) != 0)

     def test_pretrained_config_save_load(self):
-        if (self.base_model_class is None or
-                not self.base_model_class.constructed_from_pretrained_config()):
+        if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config():
             return

         config_class = self.base_model_class.config_class
@@ -627,23 +590,21 @@ def test_pretrained_config_save_load(self):
             config.save_pretrained(tempdir)

             # check the file exist
-            self.assertFalse(
-                os.path.exists(os.path.join(tempdir, LEGACY_CONFIG_NAME)))
+            self.assertFalse(os.path.exists(os.path.join(tempdir, LEGACY_CONFIG_NAME)))
             self.assertTrue(os.path.exists(os.path.join(tempdir, CONFIG_NAME)))

             # rename the CONFIG_NAME
             shutil.move(
                 os.path.join(tempdir, CONFIG_NAME),
-                os.path.join(tempdir, LEGACY_CONFIG_NAME), )
+                os.path.join(tempdir, LEGACY_CONFIG_NAME),
+            )

             loaded_config = config.__class__.from_pretrained(tempdir)
             for key in config.__dict__.keys():
-                self.assertEqual(
-                    getattr(config, key), getattr(loaded_config, key))
+                self.assertEqual(getattr(config, key), getattr(loaded_config, key))

     def random_choice_pretrained_config_field(self) -> Optional[str]:
-        if (self.base_model_class is None or
-                not self.base_model_class.constructed_from_pretrained_config()):
+        if self.base_model_class is None or not self.base_model_class.constructed_from_pretrained_config():
             return None

         config = self.base_model_class.config_class()
@@ -652,21 +613,17 @@ def random_choice_pretrained_config_field(self) -> Optional[str]:

     def test_for_missed_attribute(self):
         if not self.test_model_compatibility_keys:
-            self.skipTest(
-                f"Do not test model_compatibility_keys on {self.base_model_class}"
-            )
+            self.skipTest(f"Do not test model_compatibility_keys on {self.base_model_class}")
             return

-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

         for model_class in self.all_model_classes:
             if not model_class.constructed_from_pretrained_config():
                 continue

             model = self._make_model_instance(config, model_class)

-            all_maps: dict = copy.deepcopy(
-                model_class.config_class.attribute_map)
+            all_maps: dict = copy.deepcopy(model_class.config_class.attribute_map)
             for old_attribute, new_attribute in all_maps.items():
                 old_value = getattr(model.config, old_attribute)
@@ -683,11 +640,9 @@ def test_tie_weight(self):
         if not self.test_tie_weights:
             return

-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
-        )
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
-            if ("CausalLM" not in model_class.__name__ and
-                    "MaskedLM" not in model_class.__name__):
+            if "CausalLM" not in model_class.__name__ and "MaskedLM" not in model_class.__name__:
                 continue

             model = self._make_model_instance(config, model_class)
@@ -695,8 +650,7 @@ def test_tie_weight(self):
             if not model.config.tie_word_embeddings:
                 continue

-            if hasattr(model, "get_input_embeddings") and hasattr(
-                    model, "get_output_embeddings"):
+            if hasattr(model, "get_input_embeddings") and hasattr(model, "get_output_embeddings"):
                 try:
                     input_embeddings = model.get_input_embeddings()
                 except NotImplementedError:
@@ -719,14 +673,16 @@ def test_tie_weight(self):
                     input_embeddings_weight = input_embeddings
                 print(
                     input_embeddings_weight,
-                    output_embeddings_weight, )
-                print("model name :{},id is{},{}".format(
-                    model_class,
-                    id(output_embeddings_weight),
-                    id(input_embeddings_weight), ))
-                self.assertEqual(
-                    id(output_embeddings_weight),
-                    id(input_embeddings_weight))
+                    output_embeddings_weight,
+                )
+                print(
+                    "model name :{},id is{},{}".format(
+                        model_class,
+                        id(output_embeddings_weight),
+                        id(input_embeddings_weight),
+                    )
+                )
+                self.assertEqual(id(output_embeddings_weight), id(input_embeddings_weight))


 class ModelTesterPretrainedMixin:
@@ -739,48 +695,42 @@ class ModelTesterPretrainedMixin:
     def test_model_from_pretrained_hf_hub(self):
         if self.hf_remote_test_model_path is None or self.base_model_class is None:
             return
-        model = self.base_model_class.from_pretrained(
-            self.hf_remote_test_model_path, from_hf_hub=True)
+        model = self.base_model_class.from_pretrained(self.hf_remote_test_model_path, from_hf_hub=True)
         self.assertIsNotNone(model)

     def test_model_from_pretrained_paddle_hub(self):
-        if (self.paddlehub_remote_test_model_path is None or
-                self.base_model_class is None):
+        if self.paddlehub_remote_test_model_path is None or self.base_model_class is None:
             return
-        model = self.base_model_class.from_pretrained(
-            self.paddlehub_remote_test_model_path)
+        model = self.base_model_class.from_pretrained(self.paddlehub_remote_test_model_path)
         self.assertIsNotNone(model)

     def test_model_from_config_paddle_hub(self):
-        if (self.paddlehub_remote_test_model_path is None or
-                self.base_model_class is None):
+        if self.paddlehub_remote_test_model_path is None or self.base_model_class is None:
             return
-        config = self.base_model_class.config_class.from_pretrained(
-            self.paddlehub_remote_test_model_path)
+        config = self.base_model_class.config_class.from_pretrained(self.paddlehub_remote_test_model_path)
         model = self.base_model_class._from_config(config)
         self.assertIsNotNone(model)

     @slow
     def test_model_from_pretrained_with_cache_dir(self):
-        for model_name in list(
-                self.base_model_class.pretrained_init_configuration)[:1]:
+        for model_name in list(self.base_model_class.pretrained_init_configuration)[:1]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = str(tempdir)

-                model = self.base_model_class.from_pretrained(
-                    model_name, cache_dir=tempdir)
+                model = self.base_model_class.from_pretrained(model_name, cache_dir=tempdir)
                 self.assertIsNotNone(model)

                 self.assertTrue(
                     os.path.isfile(
                         os.path.join(
                             tempdir, model_name,
-                            self.base_model_class.resource_files_names[
-                                "model_state"], )))
+                            self.base_model_class.resource_files_names["model_state"],
+                        )
+                    )
+                )
                 self.assertTrue(
-                    os.path.isfile(
-                        os.path.join(tempdir, model_name,
-                                     self.base_model_class.model_config_file)))
+                    os.path.isfile(os.path.join(tempdir, model_name, self.base_model_class.model_config_file))
+                )

     @slow
     def test_pretrained_save_and_load(self):
@@ -788,8 +738,7 @@ def test_pretrained_save_and_load(self):

         eg: `bert-base-uncased.pdparams` and `model_state.pdparams`
         """
-        for model_name in list(
-                self.base_model_class.pretrained_init_configuration)[:1]:
+        for model_name in list(self.base_model_class.pretrained_init_configuration)[:1]:
             model = self.base_model_class.from_pretrained(model_name)
             self.assertIsNotNone(model)

@@ -798,8 +747,7 @@ def test_pretrained_save_and_load(self):
                 tempdirname = str(tempdir)
                 model.save_pretrained(tempdirname)

-                loaded_model = self.base_model_class.from_pretrained(
-                    tempdirname)
+                loaded_model = self.base_model_class.from_pretrained(tempdirname)

                 check_two_model_parameter(model, loaded_model)

@@ -809,20 +757,20 @@ def test_pretrained_save_and_load(self):

                 shutil.copytree(
                     os.path.join(MODEL_HOME, model_name),
-                    tempdirname, )
+                    tempdirname,
+                )

                 saved_model_state_file = os.path.join(
                     tempdirname,
-                    self.base_model_class.resource_files_names["model_state"], )
+                    self.base_model_class.resource_files_names["model_state"],
+                )
                 self.assertTrue(os.path.isfile(saved_model_state_file))

                 # rename it to the old style: name of url, eg: model_state.pdparams -> bert-base-uncased.pdparams
-                url = self.base_model_class.pretrained_resource_files_map[
-                    "model_state"][model_name]
+                url = self.base_model_class.pretrained_resource_files_map["model_state"][model_name]
                 pretrained_resource_file_name = os.path.split(url)[-1]
-                target_file_path = os.path.join(tempdirname,
-                                                pretrained_resource_file_name)
+                target_file_path = os.path.join(tempdirname, pretrained_resource_file_name)

                 shutil.copyfile(saved_model_state_file, target_file_path)
                 os.remove(saved_model_state_file)
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 27448810955d7..dbf16a00360e8 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -14,18 +14,10 @@

 from __future__ import annotations

-import copy
-import gc
-import inspect
 import os
-import sys
 import unittest
 from argparse import ArgumentTypeError

-import numpy as np
-import paddle
-import yaml
-

 def strtobool(v):
     if isinstance(v, bool):
@@ -47,9 +39,7 @@ def get_bool_from_env(key, default_value=False):
     try:
         value = strtobool(value)
     except ValueError:
-        raise ValueError(
-            f"If set, {key} must be yes, no, true, false, 0 or 1 (case insensitive)."
-        )
+        raise ValueError(f"If set, {key} must be yes, no, true, false, 0 or 1 (case insensitive).")
     return value